From 4e31a5d32ec0c237c927e8d142b0acdefff2ba55 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 11:55:02 -0500 Subject: [PATCH 001/101] Factor work directory management into a loc_stack class. --- libensemble/controller.py | 7 +- libensemble/libE_worker.py | 141 +++++++----------- libensemble/loc_stack.py | 70 +++++++++ libensemble/resources.py | 7 + .../tests/unit_tests/test_loc_stack.py | 93 ++++++++++++ 5 files changed, 227 insertions(+), 91 deletions(-) create mode 100644 libensemble/loc_stack.py create mode 100644 libensemble/tests/unit_tests/test_loc_stack.py diff --git a/libensemble/controller.py b/libensemble/controller.py index a78b531cd..c75067192 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -20,12 +20,7 @@ from libensemble.register import Register from libensemble.resources import Resources -if Resources.am_I_manager(): - wrkid = 'Manager' -else: - wrkid = 'w' + str(Resources.get_workerID()) - -logger = logging.getLogger(__name__ + '(' + wrkid + ')') +logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') #For debug messages in this module - uncomment (see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 1e7b5c611..d65c3108f 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -6,34 +6,46 @@ from __future__ import division from __future__ import absolute_import -import os, shutil import socket import logging import numpy as np +#Idea is dont have to have it unless using MPI option. +from mpi4py import MPI + +from libensemble.loc_stack import LocationStack + #In future these will be in CalcInfo or Comms modules #CalcInfo -from libensemble.message_numbers import EVAL_SIM_TAG, EVAL_GEN_TAG -from libensemble.message_numbers import UNSET_TAG, STOP_TAG, CALC_EXCEPTION +from libensemble.message_numbers import \ + EVAL_SIM_TAG, EVAL_GEN_TAG, \ + UNSET_TAG, STOP_TAG, CALC_EXCEPTION #Comms -from libensemble.message_numbers import MAN_SIGNAL_KILL, MAN_SIGNAL_FINISH -from libensemble.message_numbers import MAN_SIGNAL_REQ_RESEND, MAN_SIGNAL_REQ_PICKLE_DUMP +from libensemble.message_numbers import \ + MAN_SIGNAL_FINISH, \ + MAN_SIGNAL_REQ_RESEND, MAN_SIGNAL_REQ_PICKLE_DUMP + # MAN_SIGNAL_KILL from libensemble.calc_info import CalcInfo from libensemble.controller import JobController from libensemble.resources import Resources -# Rem: run on import - though Manager should never be printed - workerID/rank -if Resources.am_I_manager(): - wrkid = 'Manager' -else: - wrkid = 'w' + str(Resources.get_workerID()) - -logger = logging.getLogger(__name__ + '(' + wrkid + ')') +logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') #For debug messages in this module - uncomment (see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) + +def recv_dtypes(comm): + """Receive dtypes array broadcast from manager.""" + dtypes = {} + dtypes[EVAL_SIM_TAG] = None + dtypes[EVAL_GEN_TAG] = None + dtypes[EVAL_SIM_TAG] = comm.bcast(dtypes[EVAL_SIM_TAG], root=0) + dtypes[EVAL_GEN_TAG] = comm.bcast(dtypes[EVAL_GEN_TAG], root=0) + return dtypes + + #The routine worker_main currently uses MPI. Comms will be implemented using comms module in future def worker_main(c, sim_specs, gen_specs): """ @@ -53,23 +65,15 @@ def worker_main(c, sim_specs, gen_specs): """ - #Idea is dont have to have it unless using MPI option. 
- from mpi4py import MPI - comm = c['comm'] rank = comm.Get_rank() workerID = rank status = MPI.Status() - Worker.init_workers(sim_specs, gen_specs) # Store in Worker Class - dtypes = {} - dtypes[EVAL_SIM_TAG] = None - dtypes[EVAL_GEN_TAG] = None - dtypes[EVAL_SIM_TAG] = comm.bcast(dtypes[EVAL_SIM_TAG], root=0) - dtypes[EVAL_GEN_TAG] = comm.bcast(dtypes[EVAL_GEN_TAG], root=0) + dtypes = recv_dtypes(comm) - worker = Worker(workerID) + worker = Worker(workerID, sim_specs, gen_specs) #Setup logging logger.info("Worker {} initiated on MPI rank {} on node {}".format(workerID, rank, socket.gethostname())) @@ -78,8 +82,7 @@ def worker_main(c, sim_specs, gen_specs): CalcInfo.create_worker_statfile(worker.workerID) worker_iter = 0 - sim_iter = 0 - gen_iter = 0 + calc_iter = {EVAL_GEN_TAG : 0, EVAL_SIM_TAG : 0} #Init in case of manager request before filled worker_out = {} @@ -88,24 +91,22 @@ def worker_main(c, sim_specs, gen_specs): worker_iter += 1 logger.debug("Worker {}. Iteration {}".format(workerID, worker_iter)) - # General probe for manager communication - comm.probe(source=0, tag=MPI.ANY_TAG, status=status) + # Receive message from worker + msg = comm.recv(source=0, tag=MPI.ANY_TAG, status=status) mtag = status.Get_tag() if mtag == STOP_TAG: #If multiple choices prob change this to MANAGER_SIGNAL_TAG or something - man_signal = comm.recv(source=0, tag=STOP_TAG, status=status) - if man_signal == MAN_SIGNAL_FINISH: #shutdown the worker + if msg == MAN_SIGNAL_FINISH: #shutdown the worker break #Need to handle manager job kill here - as well as finish - if man_signal == MAN_SIGNAL_REQ_RESEND: - #And resend + if msg == MAN_SIGNAL_REQ_RESEND: logger.debug("Worker {} re-sending to Manager with status {}".format(workerID, worker.calc_status)) comm.send(obj=worker_out, dest=0) continue - if man_signal == MAN_SIGNAL_REQ_PICKLE_DUMP: + if msg == MAN_SIGNAL_REQ_PICKLE_DUMP: # Worker is requested to dump pickle file (either for read by manager or for debugging) import pickle - pfilename = "pickled_worker_{}_sim_{}.pkl".format(workerID, sim_iter) + pfilename = "pickled_worker_{}_sim_{}.pkl".format(workerID, calc_iter[EVAL_SIM_TAG]) with open(pfilename, "wb") as f: pickle.dump(worker_out, f) with open(pfilename, "rb") as f: @@ -114,20 +115,13 @@ def worker_main(c, sim_specs, gen_specs): comm.send(obj=pfilename, dest=0) continue - else: - Work = comm.recv(buf=None, source=0, tag=MPI.ANY_TAG, status=status) - + Work = msg libE_info = Work['libE_info'] calc_type = Work['tag'] #If send components - send tag separately (dont use MPI.status!) + calc_iter[calc_type] += 1 - if calc_type == EVAL_GEN_TAG: - gen_iter += 1 - if calc_type == EVAL_SIM_TAG: - sim_iter += 1 - - calc_in = np.zeros(len(libE_info['H_rows']), dtype=dtypes[calc_type]) - if len(calc_in) > 0: - calc_in = comm.recv(buf=None, source=0) + calc_in = (comm.recv(source=0) if len(libE_info['H_rows']) > 0 + else np.zeros(0, dtype=dtypes[calc_type])) logger.debug("Worker {} received calc_in of len {}".format(workerID, np.size(calc_in))) #This is current kluge for persistent worker - comm will be in the future comms module... @@ -171,21 +165,8 @@ class Worker(): """The Worker Class provides methods for controlling sim and gen funcs""" - #Class attributes - sim_specs = {} - gen_specs = {} - - #Class methods - @classmethod - def init_workers(Worker, sim_specs_in, gen_specs_in): - """Sets class attributes Worker.sim_specs and Worker.gen_specs""" - - #Class attributes? Maybe should be worker specific?? 
- Worker.sim_specs = sim_specs_in - Worker.gen_specs = gen_specs_in - # Worker Object methods - def __init__(self, workerID): + def __init__(self, workerID, sim_specs, gen_specs): """Initialise new worker object. Parameters @@ -196,6 +177,9 @@ def __init__(self, workerID): """ + self.sim_specs = sim_specs + self.gen_specs = gen_specs + self.locations = {} self.worker_dir = "" self.workerID = workerID @@ -211,15 +195,13 @@ def __init__(self, workerID): self.libE_info = None self.calc_stats = None - if 'sim_dir' in Worker.sim_specs: - self.worker_dir = Worker.sim_specs['sim_dir'] + '_' + str(self.workerID) - - if 'sim_dir_prefix' in Worker.sim_specs: - self.worker_dir = os.path.join(os.path.expanduser(Worker.sim_specs['sim_dir_prefix']), os.path.split(os.path.abspath(os.path.expanduser(self.worker_dir)))[1]) - - assert ~os.path.isdir(self.worker_dir), "Worker directory already exists." - shutil.copytree(Worker.sim_specs['sim_dir'], self.worker_dir) - self.locations[EVAL_SIM_TAG] = self.worker_dir + self.loc_stack = LocationStack() + if 'sim_dir' in self.sim_specs: + sim_dir = self.sim_specs['sim_dir'] + prefix = self.sim_specs.get('sim_dir_prefix') + worker_dir = "{}_{}".format(sim_dir, self.workerID) + self.worker_dir = self.loc_stack.register_loc(EVAL_SIM_TAG, worker_dir, + prefix=prefix, srcdir=sim_dir) #Optional - set workerID in job_controller - so will be added to jobnames and accesible to calcs try: @@ -281,33 +263,30 @@ def run(self, Work, calc_in): def clean(self): """Clean up calculation directories""" - for loc in self.locations.values(): - shutil.rmtree(loc) + self.loc_stack.clean_locs() def _perform_calc(self, calc_in, persis_info, libE_info): - if self.calc_type in self.locations: - saved_dir = os.getcwd() - os.chdir(self.locations[self.calc_type]) - ### ============================== Run calc ==================================== # This is in a try/except block to allow handling if exception is raised in user code # Currently report exception to summary file and pass exception up (where libE will mpi_abort) # Continuation of ensemble may be added as an option. + self.loc_stack.push_loc(self.calc_type) try: if self.calc_type == EVAL_SIM_TAG: - out = Worker.sim_specs['sim_f'](calc_in, persis_info, Worker.sim_specs, libE_info) + out = self.sim_specs['sim_f'](calc_in, persis_info, self.sim_specs, libE_info) else: - out = Worker.gen_specs['gen_f'](calc_in, persis_info, Worker.gen_specs, libE_info) + out = self.gen_specs['gen_f'](calc_in, persis_info, self.gen_specs, libE_info) except Exception as e: # Write to workers summary file and pass exception up - if self.calc_type in self.locations: - os.chdir(saved_dir) self.calc_stats.stop_timer() self.calc_status = CALC_EXCEPTION self.calc_stats.set_calc_status(self.calc_status) CalcInfo.add_calc_worker_statfile(calc=self.calc_stats) raise + finally: + # Pop the directory with or without an exception + self.loc_stack.pop() ### ============================================================================ assert isinstance(out, tuple), "Calculation output must be a tuple. 
Worker exiting" @@ -315,13 +294,5 @@ def _perform_calc(self, calc_in, persis_info, libE_info): H = out[0] persis_info = out[1] - - calc_tag = UNSET_TAG #None - if len(out) >= 3: - calc_tag = out[2] - - if self.calc_type in self.locations: - os.chdir(saved_dir) - - #return data_out, calc_tag + calc_tag = out[2] if len(out) >= 3 else UNSET_TAG return H, persis_info, libE_info, calc_tag diff --git a/libensemble/loc_stack.py b/libensemble/loc_stack.py new file mode 100644 index 000000000..c0ecdc99a --- /dev/null +++ b/libensemble/loc_stack.py @@ -0,0 +1,70 @@ +""" +libensemble utility class -- keeps a stack of directory locations. +""" + +import os +import shutil + +class LocationStack: + """Keep a stack of directory locations. + """ + + def __init__(self): + """Initialize the location dictionary and directory stack.""" + self.dirs = {} + self.stack = [] + + def register_loc(self, key, dirname, prefix=None, srcdir=None): + """Register a new location in the dictionary. + + Parameters + ---------- + + key: + The key used to identify the new location. + + dirname: string: + Directory name + + prefix: string: + Prefix to be used with the dirname. If prefix is not None, + only the base part of the dirname is used. + + srcdir: string: + Name of a source directory to populate the new location. + If srcdir is not None, the directory should not yet exist. + srcdir is not relative to prefix. + """ + if prefix is not None: + prefix = os.path.expanduser(prefix) + dirname = os.path.join(prefix, os.path.basename(dirname)) + self.dirs[key] = dirname + if srcdir is not None: + assert ~os.path.isdir(dirname), \ + "Directory {} already exists".format(dirname) + shutil.copytree(srcdir, dirname) + return dirname + + def push_loc(self, key): + """Push a location from the dictionary.""" + self.push(self.dirs.get(key)) + + def clean_locs(self): + """Remove all directories listed in the dictionary.""" + for dirname in self.dirs.values(): + if dirname is not None and os.path.isdir(dirname): + shutil.rmtree(dirname) + + def push(self, dirname): + """Push the current location and change directories (if not None).""" + if dirname is not None: + self.stack.append(os.getcwd()) + os.chdir(dirname) + else: + self.stack.append(None) + + def pop(self): + """Pop the current directory and change back.""" + dirname = self.stack.pop() + if dirname is not None: + os.chdir(dirname) diff --git a/libensemble/resources.py b/libensemble/resources.py index 061e42e30..6ed83e5d2 100644 --- a/libensemble/resources.py +++ b/libensemble/resources.py @@ -126,6 +126,13 @@ def get_num_workers(): num_workers = MPI.COMM_WORLD.Get_size() - 1 return num_workers + @staticmethod + def get_my_name(): + """Return name string""" + if Resources.am_I_manager(): + return 'Manager' + return 'w{}'.format(Resources.get_workerID()) + #Call from all libE tasks (pref. inc. manager) @staticmethod def get_libE_nodes(): diff --git a/libensemble/tests/unit_tests/test_loc_stack.py b/libensemble/tests/unit_tests/test_loc_stack.py new file mode 100644 index 000000000..cc4c59ed3 --- /dev/null +++ b/libensemble/tests/unit_tests/test_loc_stack.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +""" +Unit test of location stack for libensemble. +""" + +import os +import shutil +import tempfile + +from libensemble.loc_stack import LocationStack + +def test_location_stack(): + "Test correctness of location stack (all in a temp dir)." 
+ + tmp_dirname = tempfile.mkdtemp() + assert os.path.isdir(tmp_dirname), \ + "Failed to create temporary directory {}.".format(tmp_dirname) + + try: + # Record where we started + start_dir = os.getcwd() + + # Set up directory for clone + clone_dirname = os.path.join(tmp_dirname, "basedir") + os.mkdir(clone_dirname) + test_fname = os.path.join(clone_dirname, "test.txt") + with open(test_fname, "w+") as f: + f.write("This is a test file\n") + + s = LocationStack() + + # Register a valid location + tname = s.register_loc(0, "testdir", + prefix=tmp_dirname, + srcdir=clone_dirname) + assert os.path.isdir(tname), \ + "New directory {} was not created.".format(tname) + assert os.path.isfile(os.path.join(tname, "test.txt")), \ + "New directory {} failed to copy test.txt from {}.". \ + format(tname, clone_dirname) + + # Register an empty location + d = s.register_loc(1, None) + assert d is None, \ + "Dir stack not correctly register None at location 1." + + # Register a dummy location (del should not work) + d = s.register_loc(2, os.path.join(tmp_dirname, "dummy")) + assert ~os.path.isdir(d), \ + "Directory stack registration of dummy should not create dir." + + # Push unregistered location (we should not move) + s.push_loc(3) + assert s.stack == [None], \ + "Directory stack push_loc(missing) failed to put None on stack." + assert os.path.samefile(os.getcwd(), start_dir), \ + "Directory stack push_loc failed to stay put with input None." \ + "Wanted {}, at {}".format(start_dir, os.getcwd()) + + # Push registered location (we should move + s.push_loc(0) + assert s.stack == [None, start_dir], \ + "Directory stack is incorrect." \ + "Wanted [None, {}], got {}.".format(start_dir, s.stack) + assert os.path.samefile(os.getcwd(), tname), \ + "Directory stack push_loc failed to end up at desired dir." \ + "Wanted {}, at {}".format(tname, os.getcwd()) + + # Pop the registered location + s.pop() + assert s.stack == [None], \ + "Directory stack is incorrect after pop." \ + "Wanted [None], got {}.".format(s.stack) + assert os.path.samefile(os.getcwd(), start_dir), \ + "Directory stack push_loc failed to stay put with input None." \ + "Wanted {}, at {}".format(start_dir, os.getcwd()) + + # Pop the unregistered location + s.pop() + assert not s.stack, \ + "Directory stack should be empty, actually {}.".format(s.stack) + assert os.path.samefile(os.getcwd(), start_dir), \ + "Directory stack push_loc failed to stay put with input None." \ + "Wanted {}, at {}".format(start_dir, os.getcwd()) + + # Clean up + s.clean_locs() + assert not os.path.isdir(tname), \ + "Directory {} should have been removed on cleanup.".format(tname) + + finally: + shutil.rmtree(tmp_dirname) From 4f0163d0177b1aedc0dfa006a6f0a9f367211ac8 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 12:06:23 -0500 Subject: [PATCH 002/101] Removed worker attributes that now belong in location stack. 
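
The worker now leans entirely on the LocationStack for its directory
bookkeeping. A rough standalone sketch of the registration it performs
(the temporary paths and the "sim" key are illustrative only, not part
of this change):

    import os
    import tempfile

    from libensemble.loc_stack import LocationStack

    prefix = tempfile.mkdtemp()                # stand-in for sim_specs['sim_dir_prefix']
    sim_dir = os.path.join(prefix, "sim_dir")  # stand-in for sim_specs['sim_dir']
    os.mkdir(sim_dir)

    locs = LocationStack()
    # Same naming scheme as Worker.__init__: "<sim_dir>_<workerID>",
    # relocated under the prefix and seeded by copying sim_dir.
    worker_dir = "{}_{}".format(sim_dir, 1)
    locs.register_loc("sim", worker_dir, prefix=prefix, srcdir=sim_dir)

    locs.push_loc("sim")   # chdir into the per-worker directory
    locs.pop()             # return to the previous directory
    locs.clean_locs()      # remove every registered directory
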
--- libensemble/libE_worker.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index d65c3108f..eae6b0ac1 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -177,13 +177,10 @@ def __init__(self, workerID, sim_specs, gen_specs): """ + self.workerID = workerID self.sim_specs = sim_specs self.gen_specs = gen_specs - self.locations = {} - self.worker_dir = "" - self.workerID = workerID - self.calc_out = {} self.calc_type = None self.calc_status = UNSET_TAG #From message_numbers @@ -200,8 +197,8 @@ def __init__(self, workerID, sim_specs, gen_specs): sim_dir = self.sim_specs['sim_dir'] prefix = self.sim_specs.get('sim_dir_prefix') worker_dir = "{}_{}".format(sim_dir, self.workerID) - self.worker_dir = self.loc_stack.register_loc(EVAL_SIM_TAG, worker_dir, - prefix=prefix, srcdir=sim_dir) + self.loc_stack.register_loc(EVAL_SIM_TAG, worker_dir, + prefix=prefix, srcdir=sim_dir) #Optional - set workerID in job_controller - so will be added to jobnames and accesible to calcs try: From bd90715fdfbb7f1adb9ae04bae7ece75e95afc9f Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 12:52:03 -0500 Subject: [PATCH 003/101] Moving things around in libE_worker. --- libensemble/libE_worker.py | 70 +++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index eae6b0ac1..b5b8e120a 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -10,23 +10,16 @@ import logging import numpy as np -#Idea is dont have to have it unless using MPI option. from mpi4py import MPI -from libensemble.loc_stack import LocationStack - -#In future these will be in CalcInfo or Comms modules -#CalcInfo from libensemble.message_numbers import \ EVAL_SIM_TAG, EVAL_GEN_TAG, \ UNSET_TAG, STOP_TAG, CALC_EXCEPTION - -#Comms from libensemble.message_numbers import \ MAN_SIGNAL_FINISH, \ MAN_SIGNAL_REQ_RESEND, MAN_SIGNAL_REQ_PICKLE_DUMP - # MAN_SIGNAL_KILL +from libensemble.loc_stack import LocationStack from libensemble.calc_info import CalcInfo from libensemble.controller import JobController from libensemble.resources import Resources @@ -178,8 +171,6 @@ def __init__(self, workerID, sim_specs, gen_specs): """ self.workerID = workerID - self.sim_specs = sim_specs - self.gen_specs = gen_specs self.calc_out = {} self.calc_type = None @@ -192,23 +183,48 @@ def __init__(self, workerID, sim_specs, gen_specs): self.libE_info = None self.calc_stats = None - self.loc_stack = LocationStack() - if 'sim_dir' in self.sim_specs: - sim_dir = self.sim_specs['sim_dir'] - prefix = self.sim_specs.get('sim_dir_prefix') + self._run_calc = Worker._make_runners(sim_specs, gen_specs) + self.loc_stack = self._make_sim_worker_dir(sim_specs) + self.job_controller_set = self._set_job_controller() + + + def _make_sim_worker_dir(self, sim_specs, locs=None): + "Create a dir for sim workers if 'sim_dir' is in sim_specs" + locs = locs or LocationStack() + if 'sim_dir' in sim_specs: + sim_dir = sim_specs['sim_dir'] + prefix = sim_specs.get('sim_dir_prefix') worker_dir = "{}_{}".format(sim_dir, self.workerID) - self.loc_stack.register_loc(EVAL_SIM_TAG, worker_dir, - prefix=prefix, srcdir=sim_dir) + locs.register_loc(EVAL_SIM_TAG, worker_dir, + prefix=prefix, srcdir=sim_dir) + return locs + + + @staticmethod + def _make_runners(sim_specs, gen_specs): + "Create functions to run a sim or gen" + + def run_sim(calc_in, persis_info, 
libE_info): + "Run a sim calculation" + return sim_specs['sim_f'](calc_in, persis_info, sim_specs, libE_info) + + def run_gen(calc_in, persis_info, libE_info): + "Run a gen calculation" + return gen_specs['gen_f'](calc_in, persis_info, gen_specs, libE_info) + + return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} + - #Optional - set workerID in job_controller - so will be added to jobnames and accesible to calcs + def _set_job_controller(self): + "Optional -- set worker ID in the job controller, return if set" try: jobctl = JobController.controller - jobctl.set_workerID(workerID) + jobctl.set_workerID(self.workerID) except Exception: - logger.info("No job_controller set on worker {}".format(workerID)) - self.job_controller_set = False + logger.info("No job_controller set on worker {}".format(self.workerID)) + return False else: - self.job_controller_set = True + return True def run(self, Work, calc_in): @@ -264,27 +280,17 @@ def clean(self): def _perform_calc(self, calc_in, persis_info, libE_info): - ### ============================== Run calc ==================================== - # This is in a try/except block to allow handling if exception is raised in user code - # Currently report exception to summary file and pass exception up (where libE will mpi_abort) - # Continuation of ensemble may be added as an option. self.loc_stack.push_loc(self.calc_type) try: - if self.calc_type == EVAL_SIM_TAG: - out = self.sim_specs['sim_f'](calc_in, persis_info, self.sim_specs, libE_info) - else: - out = self.gen_specs['gen_f'](calc_in, persis_info, self.gen_specs, libE_info) + out = self._run_calc[self.calc_type](calc_in, persis_info, libE_info) except Exception as e: - # Write to workers summary file and pass exception up self.calc_stats.stop_timer() self.calc_status = CALC_EXCEPTION self.calc_stats.set_calc_status(self.calc_status) CalcInfo.add_calc_worker_statfile(calc=self.calc_stats) raise finally: - # Pop the directory with or without an exception self.loc_stack.pop() - ### ============================================================================ assert isinstance(out, tuple), "Calculation output must be a tuple. Worker exiting" assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" From 138455613d4f1c6234ef9e903f6bdd0af82037d1 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 13:11:22 -0500 Subject: [PATCH 004/101] Merging and munging in libE_worker. 
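
run() now unpacks the user function's return value directly: a tuple of
(output, persis_info) with an optional third calc_status element. A
minimal sim function satisfying that contract might look like the sketch
below (the 'f' field and the constant result are illustrative only):

    import numpy as np

    def sim_f_sketch(calc_in, persis_info, sim_specs, libE_info):
        # One output row per input row; the field name is a placeholder.
        out = np.zeros(len(calc_in), dtype=[('f', float)])
        out['f'] = 0.0
        # A third element (a calc_status tag) may be appended; when it is
        # omitted the worker records UNSET_TAG.
        return out, persis_info
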
--- libensemble/libE_worker.py | 49 ++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index b5b8e120a..a205ae782 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -175,9 +175,7 @@ def __init__(self, workerID, sim_specs, gen_specs): self.calc_out = {} self.calc_type = None self.calc_status = UNSET_TAG #From message_numbers - self.isdone = False self.calc_list = [] - self.job_controller_set = False self.persis_info = None self.libE_info = None @@ -185,7 +183,7 @@ def __init__(self, workerID, sim_specs, gen_specs): self._run_calc = Worker._make_runners(sim_specs, gen_specs) self.loc_stack = self._make_sim_worker_dir(sim_specs) - self.job_controller_set = self._set_job_controller() + self._set_job_controller() def _make_sim_worker_dir(self, sim_specs, locs=None): @@ -244,10 +242,7 @@ def run(self, Work, calc_in): """ - #Reset run specific attributes - these should maybe be in a calc object - self.calc_out = {} - self.calc_status = UNSET_TAG #From message_numbers - self.isdone = False + # calc_stats stores timing and summary info for this Calc (sim or gen) self.calc_stats = CalcInfo() @@ -262,28 +257,14 @@ def run(self, Work, calc_in): self.calc_stats.calc_type = Work['tag'] self.persis_info = Work['persis_info'] - assert self.calc_type in [EVAL_SIM_TAG, EVAL_GEN_TAG], "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" - - self.calc_out, self.persis_info, self.libE_info, self.calc_status = self._perform_calc(calc_in, self.persis_info, self.libE_info) - - #This is a libe feature that is to be reviewed for best solution - #Should atleast put in calc_stats. - self.calc_stats.set_calc_status(self.calc_status) - - self.isdone = True - self.calc_stats.stop_timer() - + assert self.calc_type in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ + "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" - def clean(self): - """Clean up calculation directories""" - self.loc_stack.clean_locs() - - - def _perform_calc(self, calc_in, persis_info, libE_info): self.loc_stack.push_loc(self.calc_type) try: - out = self._run_calc[self.calc_type](calc_in, persis_info, libE_info) + out = self._run_calc[self.calc_type](calc_in, self.persis_info, self.libE_info) except Exception as e: + self.calc_out = {} self.calc_stats.stop_timer() self.calc_status = CALC_EXCEPTION self.calc_stats.set_calc_status(self.calc_status) @@ -295,7 +276,17 @@ def _perform_calc(self, calc_in, persis_info, libE_info): assert isinstance(out, tuple), "Calculation output must be a tuple. Worker exiting" assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" - H = out[0] - persis_info = out[1] - calc_tag = out[2] if len(out) >= 3 else UNSET_TAG - return H, persis_info, libE_info, calc_tag + self.calc_out = out[0] + self.persis_info = out[1] + self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG + + #This is a libe feature that is to be reviewed for best solution + #Should atleast put in calc_stats. + self.calc_stats.set_calc_status(self.calc_status) + + self.calc_stats.stop_timer() + + + def clean(self): + """Clean up calculation directories""" + self.loc_stack.clean_locs() From 03b3f8fe0815270802e825afac6f793f60834890 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 13:35:57 -0500 Subject: [PATCH 005/101] More moving around in libE_worker + add contexts for dir stuff. 
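
The new Saved helper is intended to make the directory switch
exception-safe through a with block. Roughly the intended usage, once
the syntax slip in this commit is corrected later in the series (the
"work" key and reuse of the current directory are placeholders):

    import os

    from libensemble.loc_stack import LocationStack

    ls = LocationStack()
    ls.register_loc("work", os.getcwd())  # no copy: just record a directory

    with ls.loc("work"):
        pass   # body runs with the registered directory as cwd

    with ls.dir(None):
        pass   # a None location is recorded but no chdir happens
    # Leaving either block pops the stack and restores the previous cwd
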
--- libensemble/libE_worker.py | 31 +++++++++---------------------- libensemble/loc_stack.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index a205ae782..b06fc3536 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -127,8 +127,6 @@ def worker_main(c, sim_specs, gen_specs): if worker.libE_info.get('persistent'): del worker.libE_info['comm'] - CalcInfo.add_calc_worker_statfile(calc=worker.calc_list[-1]) - #Check if sim/gen func recieved a finish signal... #Currently this means do not send data back first if worker.calc_status == MAN_SIGNAL_FINISH: @@ -152,8 +150,6 @@ def worker_main(c, sim_specs, gen_specs): # Worker Class ###################################################################### -# All routines in Worker Class have no MPI and can be called regardless of worker -# concurrency mode. class Worker(): """The Worker Class provides methods for controlling sim and gen funcs""" @@ -242,8 +238,6 @@ def run(self, Work, calc_in): """ - - # calc_stats stores timing and summary info for this Calc (sim or gen) self.calc_stats = CalcInfo() self.calc_list.append(self.calc_stats) @@ -263,29 +257,22 @@ def run(self, Work, calc_in): self.loc_stack.push_loc(self.calc_type) try: out = self._run_calc[self.calc_type](calc_in, self.persis_info, self.libE_info) + assert isinstance(out, tuple), "Calculation output must be a tuple. Worker exiting" + assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" except Exception as e: self.calc_out = {} - self.calc_stats.stop_timer() self.calc_status = CALC_EXCEPTION - self.calc_stats.set_calc_status(self.calc_status) - CalcInfo.add_calc_worker_statfile(calc=self.calc_stats) raise + else: + self.calc_out = out[0] + self.persis_info = out[1] + self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG finally: + self.calc_stats.stop_timer() + self.calc_stats.set_calc_status(self.calc_status) + CalcInfo.add_calc_worker_statfile(calc=self.calc_stats) self.loc_stack.pop() - assert isinstance(out, tuple), "Calculation output must be a tuple. Worker exiting" - assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" - - self.calc_out = out[0] - self.persis_info = out[1] - self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG - - #This is a libe feature that is to be reviewed for best solution - #Should atleast put in calc_stats. 
- self.calc_stats.set_calc_status(self.calc_status) - - self.calc_stats.stop_timer() - def clean(self): """Clean up calculation directories""" diff --git a/libensemble/loc_stack.py b/libensemble/loc_stack.py index c0ecdc99a..273fb98ad 100644 --- a/libensemble/loc_stack.py +++ b/libensemble/loc_stack.py @@ -68,3 +68,22 @@ def pop(self): dirname = self.stack.pop() if dirname is not None: os.chdir(dirname) + + class Saved: + """Context object for use with a with statement""" + def __init__(self, ls, dirname): + self.ls = ls + self.dirname = dirname + def __enter__(self): + self.ls.push(dirname) + return self.ls + def __exit__(self, type, vaulue, traceback) + self.ls.pop() + + def loc(self, key): + """Return a with context for pushing a location key""" + return LocationStack.Saved(self, self.dirs.get(key)) + + def dir(self, dirname): + """Return a with context for pushing a """ + return LocationStack.Saved(self, dirname) From 23cbebbe20360db63c21cdc7f99d38160772e596 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 13:49:34 -0500 Subject: [PATCH 006/101] Added context manager for location, incorporate into libE_worker. --- libensemble/libE_worker.py | 5 ++-- libensemble/loc_stack.py | 6 ++--- .../tests/unit_tests/test_loc_stack.py | 23 +++++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index b06fc3536..943143e7b 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -254,9 +254,9 @@ def run(self, Work, calc_in): assert self.calc_type in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" - self.loc_stack.push_loc(self.calc_type) try: - out = self._run_calc[self.calc_type](calc_in, self.persis_info, self.libE_info) + with self.loc_stack.loc(self.calc_type) + out = self._run_calc[self.calc_type](calc_in, self.persis_info, self.libE_info) assert isinstance(out, tuple), "Calculation output must be a tuple. Worker exiting" assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" except Exception as e: @@ -271,7 +271,6 @@ def run(self, Work, calc_in): self.calc_stats.stop_timer() self.calc_stats.set_calc_status(self.calc_status) CalcInfo.add_calc_worker_statfile(calc=self.calc_stats) - self.loc_stack.pop() def clean(self): diff --git a/libensemble/loc_stack.py b/libensemble/loc_stack.py index 273fb98ad..092b518a4 100644 --- a/libensemble/loc_stack.py +++ b/libensemble/loc_stack.py @@ -75,9 +75,9 @@ def __init__(self, ls, dirname): self.ls = ls self.dirname = dirname def __enter__(self): - self.ls.push(dirname) + self.ls.push(self.dirname) return self.ls - def __exit__(self, type, vaulue, traceback) + def __exit__(self, etype, value, traceback): self.ls.pop() def loc(self, key): @@ -85,5 +85,5 @@ def loc(self, key): return LocationStack.Saved(self, self.dirs.get(key)) def dir(self, dirname): - """Return a with context for pushing a """ + """Return a with context for pushing a directory""" return LocationStack.Saved(self, dirname) diff --git a/libensemble/tests/unit_tests/test_loc_stack.py b/libensemble/tests/unit_tests/test_loc_stack.py index cc4c59ed3..217a3b8e1 100644 --- a/libensemble/tests/unit_tests/test_loc_stack.py +++ b/libensemble/tests/unit_tests/test_loc_stack.py @@ -76,6 +76,29 @@ def test_location_stack(): "Directory stack push_loc failed to stay put with input None." 
\ "Wanted {}, at {}".format(start_dir, os.getcwd()) + # Context for moving again + with s.loc(0): + assert s.stack == [None, start_dir], \ + "Directory stack is incorrect." \ + "Wanted [None, {}], got {}.".format(start_dir, s.stack) + assert os.path.samefile(os.getcwd(), tname), \ + "Directory stack push_loc failed to end up at desired dir." \ + "Wanted {}, at {}".format(tname, os.getcwd()) + + # Check directory after context + assert s.stack == [None], \ + "Directory stack is incorrect after ctx." \ + "Wanted [None], got {}.".format(s.stack) + assert os.path.samefile(os.getcwd(), start_dir), \ + "Directory looks wrong after ctx." \ + "Wanted {}, at {}".format(start_dir, os.getcwd()) + + with s.dir(None): + assert s.stack == [None,None], \ + "Directory stack is incorrect in ctx." + assert s.stack == [None], \ + "Directory stack is incorrect after ctx." + # Pop the unregistered location s.pop() assert not s.stack, \ From f4a9316bc0eafeeb62a716609725118926400d44 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 14:13:03 -0500 Subject: [PATCH 007/101] Fix syntax goof and move things around. --- libensemble/libE_worker.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 943143e7b..b9f54cd10 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -237,6 +237,8 @@ def run(self, Work, calc_in): Rows from the :ref:`history array` for processing """ + assert Work['tag'] in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ + "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" # calc_stats stores timing and summary info for this Calc (sim or gen) self.calc_stats = CalcInfo() @@ -251,22 +253,18 @@ def run(self, Work, calc_in): self.calc_stats.calc_type = Work['tag'] self.persis_info = Work['persis_info'] - assert self.calc_type in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ - "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" - try: - with self.loc_stack.loc(self.calc_type) + with self.loc_stack.loc(self.calc_type): out = self._run_calc[self.calc_type](calc_in, self.persis_info, self.libE_info) assert isinstance(out, tuple), "Calculation output must be a tuple. 
Worker exiting" assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" + self.calc_out = out[0] + self.persis_info = out[1] + self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG except Exception as e: self.calc_out = {} self.calc_status = CALC_EXCEPTION raise - else: - self.calc_out = out[0] - self.persis_info = out[1] - self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG finally: self.calc_stats.stop_timer() self.calc_stats.set_calc_status(self.calc_status) From 069afaa4e4ec508d23294c4cc53de56e2dcaf6f3 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 14:21:34 -0500 Subject: [PATCH 008/101] Changed calc_stats from attribute to local var in worker.run --- libensemble/libE_worker.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index b9f54cd10..6109bbfa2 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -175,7 +175,6 @@ def __init__(self, workerID, sim_specs, gen_specs): self.persis_info = None self.libE_info = None - self.calc_stats = None self._run_calc = Worker._make_runners(sim_specs, gen_specs) self.loc_stack = self._make_sim_worker_dir(sim_specs) @@ -241,17 +240,17 @@ def run(self, Work, calc_in): "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" # calc_stats stores timing and summary info for this Calc (sim or gen) - self.calc_stats = CalcInfo() - self.calc_list.append(self.calc_stats) + calc_stats = CalcInfo() + self.calc_list.append(calc_stats) #Timing will include setup/teardown - self.calc_stats.start_timer() + calc_stats.start_timer() #Could keep all this inside the Work dictionary if sending all Work ... self.libE_info = Work['libE_info'] self.calc_type = Work['tag'] - self.calc_stats.calc_type = Work['tag'] self.persis_info = Work['persis_info'] + calc_stats.calc_type = self.calc_type try: with self.loc_stack.loc(self.calc_type): @@ -266,9 +265,9 @@ def run(self, Work, calc_in): self.calc_status = CALC_EXCEPTION raise finally: - self.calc_stats.stop_timer() - self.calc_stats.set_calc_status(self.calc_status) - CalcInfo.add_calc_worker_statfile(calc=self.calc_stats) + calc_stats.stop_timer() + calc_stats.set_calc_status(self.calc_status) + CalcInfo.add_calc_worker_statfile(calc=calc_stats) def clean(self): From 1740737ebd8d763c1e05ca2bdbc640425adf9c81 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 14:35:50 -0500 Subject: [PATCH 009/101] Reformatting in libE_worker. --- libensemble/libE_worker.py | 59 ++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 6109bbfa2..d7778b489 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -25,7 +25,8 @@ from libensemble.resources import Resources logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') -#For debug messages in this module - uncomment (see libE.py to change root logging level) +#For debug messages in this module - uncomment +# (see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) @@ -39,7 +40,8 @@ def recv_dtypes(comm): return dtypes -#The routine worker_main currently uses MPI. Comms will be implemented using comms module in future +#The routine worker_main currently uses MPI. +#Comms will be implemented using comms module in future def worker_main(c, sim_specs, gen_specs): """ Evaluate calculations given to it by the manager. 
@@ -69,7 +71,8 @@ def worker_main(c, sim_specs, gen_specs): worker = Worker(workerID, sim_specs, gen_specs) #Setup logging - logger.info("Worker {} initiated on MPI rank {} on node {}".format(workerID, rank, socket.gethostname())) + logger.info("Worker {} initiated on MPI rank {} on node {}". \ + format(workerID, rank, socket.gethostname())) # Print calc_list on-the-fly CalcInfo.create_worker_statfile(worker.workerID) @@ -87,19 +90,22 @@ def worker_main(c, sim_specs, gen_specs): # Receive message from worker msg = comm.recv(source=0, tag=MPI.ANY_TAG, status=status) mtag = status.Get_tag() - if mtag == STOP_TAG: #If multiple choices prob change this to MANAGER_SIGNAL_TAG or something + if mtag == STOP_TAG: # Change this to MANAGER_SIGNAL_TAG or something if msg == MAN_SIGNAL_FINISH: #shutdown the worker break #Need to handle manager job kill here - as well as finish if msg == MAN_SIGNAL_REQ_RESEND: - logger.debug("Worker {} re-sending to Manager with status {}".format(workerID, worker.calc_status)) + logger.debug("Worker {} re-sending to Manager with status {}".\ + format(workerID, worker.calc_status)) comm.send(obj=worker_out, dest=0) continue if msg == MAN_SIGNAL_REQ_PICKLE_DUMP: - # Worker is requested to dump pickle file (either for read by manager or for debugging) + # Worker is requested to dump pickle file + # (either for read by manager or for debugging) import pickle - pfilename = "pickled_worker_{}_sim_{}.pkl".format(workerID, calc_iter[EVAL_SIM_TAG]) + pfilename = "pickled_worker_{}_sim_{}.pkl".\ + format(workerID, calc_iter[EVAL_SIM_TAG]) with open(pfilename, "wb") as f: pickle.dump(worker_out, f) with open(pfilename, "rb") as f: @@ -110,14 +116,16 @@ def worker_main(c, sim_specs, gen_specs): Work = msg libE_info = Work['libE_info'] - calc_type = Work['tag'] #If send components - send tag separately (dont use MPI.status!) + calc_type = Work['tag'] # Wend tag separately (dont use MPI.status!) calc_iter[calc_type] += 1 calc_in = (comm.recv(source=0) if len(libE_info['H_rows']) > 0 else np.zeros(0, dtype=dtypes[calc_type])) - logger.debug("Worker {} received calc_in of len {}".format(workerID, np.size(calc_in))) + logger.debug("Worker {} received calc_in of len {}". \ + format(workerID, np.size(calc_in))) - #This is current kluge for persistent worker - comm will be in the future comms module... + #This is current kludge for persistent worker - + #comm will be in the future comms module... if libE_info.get('persistent'): libE_info['comm'] = comm Work['libE_info'] = libE_info @@ -139,7 +147,8 @@ def worker_main(c, sim_specs, gen_specs): 'calc_status': worker.calc_status, 'calc_type': worker.calc_type} - logger.debug("Worker {} sending to Manager with status {}".format(workerID, worker.calc_status)) + logger.debug("Worker {} sending to Manager with status {}". 
\ + format(workerID, worker.calc_status)) comm.send(obj=worker_out, dest=0) #blocking if sim_specs.get('clean_jobs'): @@ -199,11 +208,13 @@ def _make_runners(sim_specs, gen_specs): def run_sim(calc_in, persis_info, libE_info): "Run a sim calculation" - return sim_specs['sim_f'](calc_in, persis_info, sim_specs, libE_info) + return sim_specs['sim_f'](calc_in, persis_info, + sim_specs, libE_info) def run_gen(calc_in, persis_info, libE_info): "Run a gen calculation" - return gen_specs['gen_f'](calc_in, persis_info, gen_specs, libE_info) + return gen_specs['gen_f'](calc_in, persis_info, + gen_specs, libE_info) return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} @@ -214,7 +225,8 @@ def _set_job_controller(self): jobctl = JobController.controller jobctl.set_workerID(self.workerID) except Exception: - logger.info("No job_controller set on worker {}".format(self.workerID)) + logger.info("No job_controller set on worker {}".\ + format(self.workerID)) return False else: return True @@ -223,8 +235,8 @@ def _set_job_controller(self): def run(self, Work, calc_in): """Run a calculation on this worker object. - This routine calls the user calculations. Exceptions are caught, dumped to - the summary file, and raised. + This routine calls the user calculations. Exceptions are caught, + dumped to the summary file, and raised. Parameters ---------- @@ -233,8 +245,8 @@ def run(self, Work, calc_in): :ref:`(example)` calc_in: obj: numpy structured array - Rows from the :ref:`history array` for processing - + Rows from the :ref:`history array` + for processing """ assert Work['tag'] in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" @@ -253,10 +265,15 @@ def run(self, Work, calc_in): calc_stats.calc_type = self.calc_type try: + calc = self._run_calc[self.calc_type] with self.loc_stack.loc(self.calc_type): - out = self._run_calc[self.calc_type](calc_in, self.persis_info, self.libE_info) - assert isinstance(out, tuple), "Calculation output must be a tuple. Worker exiting" - assert len(out) >= 2, "Calculation output must be at least two elements when a tuple" + out = calc(calc_in, self.persis_info, self.libE_info) + + assert isinstance(out, tuple), \ + "Calculation output must be a tuple." + assert len(out) >= 2, \ + "Calculation output must be at least two elements." + self.calc_out = out[0] self.persis_info = out[1] self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG From bb69b57d17134107839e2ad0e9deda2830c84fbb Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 15:04:08 -0500 Subject: [PATCH 010/101] Changing attributes to local vars. 
--- libensemble/libE_worker.py | 41 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index d7778b489..6f530bdd5 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -110,7 +110,8 @@ def worker_main(c, sim_specs, gen_specs): pickle.dump(worker_out, f) with open(pfilename, "rb") as f: pickle.load(f) #check can read in this side - logger.debug("Worker {} dumping pickle and notifying manager: status {}".format(workerID, worker.calc_status)) + logger.debug("Worker {} dumping pickle and notifying manager: " + "status {}".format(workerID, worker.calc_status)) comm.send(obj=pfilename, dest=0) continue @@ -132,8 +133,8 @@ def worker_main(c, sim_specs, gen_specs): worker.run(Work, calc_in) - if worker.libE_info.get('persistent'): - del worker.libE_info['comm'] + if libE_info.get('persistent'): + del libE_info['comm'] #Check if sim/gen func recieved a finish signal... #Currently this means do not send data back first @@ -143,9 +144,9 @@ def worker_main(c, sim_specs, gen_specs): # Determine data to be returned to manager worker_out = {'calc_out': worker.calc_out, 'persis_info': worker.persis_info, - 'libE_info': worker.libE_info, + 'libE_info': libE_info, 'calc_status': worker.calc_status, - 'calc_type': worker.calc_type} + 'calc_type': calc_type} logger.debug("Worker {} sending to Manager with status {}". \ format(workerID, worker.calc_status)) @@ -178,25 +179,24 @@ def __init__(self, workerID, sim_specs, gen_specs): self.workerID = workerID self.calc_out = {} - self.calc_type = None self.calc_status = UNSET_TAG #From message_numbers self.calc_list = [] self.persis_info = None - self.libE_info = None self._run_calc = Worker._make_runners(sim_specs, gen_specs) - self.loc_stack = self._make_sim_worker_dir(sim_specs) - self._set_job_controller() + self.loc_stack = Worker._make_sim_worker_dir(sim_specs, workerID) + Worker._set_job_controller(workerID) - def _make_sim_worker_dir(self, sim_specs, locs=None): + @staticmethod + def _make_sim_worker_dir(sim_specs, workerID, locs=None): "Create a dir for sim workers if 'sim_dir' is in sim_specs" locs = locs or LocationStack() if 'sim_dir' in sim_specs: sim_dir = sim_specs['sim_dir'] prefix = sim_specs.get('sim_dir_prefix') - worker_dir = "{}_{}".format(sim_dir, self.workerID) + worker_dir = "{}_{}".format(sim_dir, workerID) locs.register_loc(EVAL_SIM_TAG, worker_dir, prefix=prefix, srcdir=sim_dir) return locs @@ -219,14 +219,15 @@ def run_gen(calc_in, persis_info, libE_info): return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} - def _set_job_controller(self): + @staticmethod + def _set_job_controller(workerID): "Optional -- set worker ID in the job controller, return if set" try: jobctl = JobController.controller - jobctl.set_workerID(self.workerID) + jobctl.set_workerID(workerID) except Exception: logger.info("No job_controller set on worker {}".\ - format(self.workerID)) + format(workerID)) return False else: return True @@ -259,15 +260,15 @@ def run(self, Work, calc_in): calc_stats.start_timer() #Could keep all this inside the Work dictionary if sending all Work ... 
- self.libE_info = Work['libE_info'] - self.calc_type = Work['tag'] self.persis_info = Work['persis_info'] - calc_stats.calc_type = self.calc_type + libE_info = Work['libE_info'] + calc_type = Work['tag'] + calc_stats.calc_type = calc_type try: - calc = self._run_calc[self.calc_type] - with self.loc_stack.loc(self.calc_type): - out = calc(calc_in, self.persis_info, self.libE_info) + calc = self._run_calc[calc_type] + with self.loc_stack.loc(calc_type): + out = calc(calc_in, self.persis_info, libE_info) assert isinstance(out, tuple), \ "Calculation output must be a tuple." From b7189e9142c14819b9b93a3ec54765dcf7bc0251 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 18:23:20 -0500 Subject: [PATCH 011/101] Cleaning up in libE_worker. --- libensemble/libE_worker.py | 144 ++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 82 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 6f530bdd5..5cfdb87e0 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -8,8 +8,9 @@ import socket import logging -import numpy as np +from itertools import count +import numpy as np from mpi4py import MPI from libensemble.message_numbers import \ @@ -40,6 +41,24 @@ def recv_dtypes(comm): return dtypes +def recv_from_manager(comm): + """Receive a tagged message from the manager.""" + status = MPI.Status() + msg = comm.recv(source=0, tag=MPI.ANY_TAG, status=status) + mtag = status.Get_tag() + return mtag, msg + + +def dump_pickle(pfilename, worker_out): + """Write a pickle of the message.""" + import pickle + with open(pfilename, "wb") as f: + pickle.dump(worker_out, f) + with open(pfilename, "rb") as f: + pickle.load(f) #check can read in this side + return pfilename + + #The routine worker_main currently uses MPI. #Comms will be implemented using comms module in future def worker_main(c, sim_specs, gen_specs): @@ -61,96 +80,70 @@ def worker_main(c, sim_specs, gen_specs): """ comm = c['comm'] - - rank = comm.Get_rank() - workerID = rank - - status = MPI.Status() dtypes = recv_dtypes(comm) - - worker = Worker(workerID, sim_specs, gen_specs) + worker = Worker(comm.Get_rank(), sim_specs, gen_specs) #Setup logging - logger.info("Worker {} initiated on MPI rank {} on node {}". \ - format(workerID, rank, socket.gethostname())) + logger.info("Worker initiated on MPI rank {} on node {}". \ + format(comm.Get_rank(), socket.gethostname())) # Print calc_list on-the-fly CalcInfo.create_worker_statfile(worker.workerID) - worker_iter = 0 - calc_iter = {EVAL_GEN_TAG : 0, EVAL_SIM_TAG : 0} - #Init in case of manager request before filled - worker_out = {} + worker_out = {'calc_status': UNSET_TAG} + calc_iter = {EVAL_SIM_TAG : 0, EVAL_GEN_TAG : 0} + + for worker_iter in count(start=1): + logger.debug("Iteration {}".format(worker_iter)) - while True: - worker_iter += 1 - logger.debug("Worker {}. 
Iteration {}".format(workerID, worker_iter)) + mtag, Work = recv_from_manager(comm) + if mtag == STOP_TAG: - # Receive message from worker - msg = comm.recv(source=0, tag=MPI.ANY_TAG, status=status) - mtag = status.Get_tag() - if mtag == STOP_TAG: # Change this to MANAGER_SIGNAL_TAG or something - if msg == MAN_SIGNAL_FINISH: #shutdown the worker + if Work == MAN_SIGNAL_FINISH: #shutdown the worker break #Need to handle manager job kill here - as well as finish - if msg == MAN_SIGNAL_REQ_RESEND: - logger.debug("Worker {} re-sending to Manager with status {}".\ - format(workerID, worker.calc_status)) + if Work == MAN_SIGNAL_REQ_RESEND: + logger.debug("Re-sending to Manager with status {}".\ + format(worker_out['calc_status'])) comm.send(obj=worker_out, dest=0) continue - if msg == MAN_SIGNAL_REQ_PICKLE_DUMP: - # Worker is requested to dump pickle file - # (either for read by manager or for debugging) - import pickle + if Work == MAN_SIGNAL_REQ_PICKLE_DUMP: pfilename = "pickled_worker_{}_sim_{}.pkl".\ - format(workerID, calc_iter[EVAL_SIM_TAG]) - with open(pfilename, "wb") as f: - pickle.dump(worker_out, f) - with open(pfilename, "rb") as f: - pickle.load(f) #check can read in this side - logger.debug("Worker {} dumping pickle and notifying manager: " - "status {}".format(workerID, worker.calc_status)) - comm.send(obj=pfilename, dest=0) + format(worker.workerID, calc_iter[EVAL_SIM_TAG]) + logger.debug("Make pickle for manager: status {}".\ + format(worker_out['calc_status'])) + comm.send(obj=dump_pickle(pfilename, worker_out), dest=0) continue - Work = msg libE_info = Work['libE_info'] - calc_type = Work['tag'] # Wend tag separately (dont use MPI.status!) + calc_type = Work['tag'] # Send tag separately (dont use MPI.status!) calc_iter[calc_type] += 1 calc_in = (comm.recv(source=0) if len(libE_info['H_rows']) > 0 else np.zeros(0, dtype=dtypes[calc_type])) - logger.debug("Worker {} received calc_in of len {}". \ - format(workerID, np.size(calc_in))) + logger.debug("Received calc_in of len {}".format(np.size(calc_in))) - #This is current kludge for persistent worker - #comm will be in the future comms module... if libE_info.get('persistent'): libE_info['comm'] = comm - Work['libE_info'] = libE_info + calc_out, persis_info, calc_status = worker.run(Work, calc_in) + libE_info.pop('comm', None) - worker.run(Work, calc_in) - - if libE_info.get('persistent'): - del libE_info['comm'] + worker_out = {'calc_out': calc_out, + 'persis_info': persis_info, + 'libE_info': libE_info, + 'calc_status': calc_status, + 'calc_type': calc_type} #Check if sim/gen func recieved a finish signal... #Currently this means do not send data back first - if worker.calc_status == MAN_SIGNAL_FINISH: + if calc_status == MAN_SIGNAL_FINISH: break - # Determine data to be returned to manager - worker_out = {'calc_out': worker.calc_out, - 'persis_info': worker.persis_info, - 'libE_info': libE_info, - 'calc_status': worker.calc_status, - 'calc_type': calc_type} - - logger.debug("Worker {} sending to Manager with status {}". 
\ - format(workerID, worker.calc_status)) - comm.send(obj=worker_out, dest=0) #blocking + logger.debug("Sending to Manager with status {}".format(calc_status)) + comm.send(obj=worker_out, dest=0) if sim_specs.get('clean_jobs'): worker.clean() @@ -177,13 +170,6 @@ def __init__(self, workerID, sim_specs, gen_specs): """ self.workerID = workerID - - self.calc_out = {} - self.calc_status = UNSET_TAG #From message_numbers - self.calc_list = [] - - self.persis_info = None - self._run_calc = Worker._make_runners(sim_specs, gen_specs) self.loc_stack = Worker._make_sim_worker_dir(sim_specs, workerID) Worker._set_job_controller(workerID) @@ -206,15 +192,14 @@ def _make_sim_worker_dir(sim_specs, workerID, locs=None): def _make_runners(sim_specs, gen_specs): "Create functions to run a sim or gen" + sim_f = sim_specs['sim_f'] + gen_f = sim_specs['gen_f'] + def run_sim(calc_in, persis_info, libE_info): - "Run a sim calculation" - return sim_specs['sim_f'](calc_in, persis_info, - sim_specs, libE_info) + return sim_f(calc_in, persis_info, sim_specs, libE_info) def run_gen(calc_in, persis_info, libE_info): - "Run a gen calculation" - return gen_specs['gen_f'](calc_in, persis_info, - gen_specs, libE_info) + return gen_f(calc_in, persis_info, gen_specs, libE_info) return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} @@ -254,37 +239,32 @@ def run(self, Work, calc_in): # calc_stats stores timing and summary info for this Calc (sim or gen) calc_stats = CalcInfo() - self.calc_list.append(calc_stats) #Timing will include setup/teardown calc_stats.start_timer() #Could keep all this inside the Work dictionary if sending all Work ... - self.persis_info = Work['persis_info'] - libE_info = Work['libE_info'] calc_type = Work['tag'] calc_stats.calc_type = calc_type try: calc = self._run_calc[calc_type] with self.loc_stack.loc(calc_type): - out = calc(calc_in, self.persis_info, libE_info) + out = calc(calc_in, Work['persis_info'], Work['libE_info']) assert isinstance(out, tuple), \ "Calculation output must be a tuple." assert len(out) >= 2, \ "Calculation output must be at least two elements." - self.calc_out = out[0] - self.persis_info = out[1] - self.calc_status = out[2] if len(out) >= 3 else UNSET_TAG - except Exception as e: - self.calc_out = {} - self.calc_status = CALC_EXCEPTION + calc_status = out[2] if len(out) >= 3 else UNSET_TAG + return out[0], out[1], calc_status + except Exception: + calc_status = CALC_EXCEPTION raise finally: calc_stats.stop_timer() - calc_stats.set_calc_status(self.calc_status) + calc_stats.set_calc_status(calc_status) CalcInfo.add_calc_worker_statfile(calc=calc_stats) From 9120a11b2c181f98b25e702c6077924b1dce3b94 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 18:50:00 -0500 Subject: [PATCH 012/101] Further moving about in libE_worker. 
--- libensemble/libE_worker.py | 73 ++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 5cfdb87e0..97f68d4bf 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -59,6 +59,29 @@ def dump_pickle(pfilename, worker_out): return pfilename +def receive_and_run(comm, dtypes, worker, Work): + """Receive data associated with a work order and run calc.""" + + libE_info = Work['libE_info'] + calc_type = Work['tag'] + + calc_in = (comm.recv(source=0) if len(libE_info['H_rows']) > 0 + else np.zeros(0, dtype=dtypes[calc_type])) + logger.debug("Received calc_in of len {}".format(np.size(calc_in))) + + #comm will be in the future comms module... + if libE_info.get('persistent'): + libE_info['comm'] = comm + calc_out, persis_info, calc_status = worker.run(Work, calc_in) + libE_info.pop('comm', None) + + return {'calc_out': calc_out, + 'persis_info': persis_info, + 'libE_info': libE_info, + 'calc_status': calc_status, + 'calc_type': calc_type} + + #The routine worker_main currently uses MPI. #Comms will be implemented using comms module in future def worker_main(c, sim_specs, gen_specs): @@ -92,7 +115,6 @@ def worker_main(c, sim_specs, gen_specs): #Init in case of manager request before filled worker_out = {'calc_status': UNSET_TAG} - calc_iter = {EVAL_SIM_TAG : 0, EVAL_GEN_TAG : 0} for worker_iter in count(start=1): logger.debug("Iteration {}".format(worker_iter)) @@ -111,38 +133,20 @@ def worker_main(c, sim_specs, gen_specs): if Work == MAN_SIGNAL_REQ_PICKLE_DUMP: pfilename = "pickled_worker_{}_sim_{}.pkl".\ - format(worker.workerID, calc_iter[EVAL_SIM_TAG]) + format(worker.workerID, worker.calc_iter[EVAL_SIM_TAG]) logger.debug("Make pickle for manager: status {}".\ format(worker_out['calc_status'])) comm.send(obj=dump_pickle(pfilename, worker_out), dest=0) continue - libE_info = Work['libE_info'] - calc_type = Work['tag'] # Send tag separately (dont use MPI.status!) - calc_iter[calc_type] += 1 - - calc_in = (comm.recv(source=0) if len(libE_info['H_rows']) > 0 - else np.zeros(0, dtype=dtypes[calc_type])) - logger.debug("Received calc_in of len {}".format(np.size(calc_in))) - - #comm will be in the future comms module... - if libE_info.get('persistent'): - libE_info['comm'] = comm - calc_out, persis_info, calc_status = worker.run(Work, calc_in) - libE_info.pop('comm', None) - - worker_out = {'calc_out': calc_out, - 'persis_info': persis_info, - 'libE_info': libE_info, - 'calc_status': calc_status, - 'calc_type': calc_type} - - #Check if sim/gen func recieved a finish signal... 
- #Currently this means do not send data back first - if calc_status == MAN_SIGNAL_FINISH: + worker_out = receive_and_run(comm, dtypes, worker, Work) + + # Check whether worker exited because it polled a manager signal + if worker_out['calc_status'] == MAN_SIGNAL_FINISH: break - logger.debug("Sending to Manager with status {}".format(calc_status)) + logger.debug("Sending to Manager with status {}".\ + format(worker_out['calc_status'])) comm.send(obj=worker_out, dest=0) if sim_specs.get('clean_jobs'): @@ -168,10 +172,10 @@ def __init__(self, workerID, sim_specs, gen_specs): The ID for this worker """ - self.workerID = workerID - self._run_calc = Worker._make_runners(sim_specs, gen_specs) + self.calc_iter = {EVAL_SIM_TAG : 0, EVAL_GEN_TAG : 0} self.loc_stack = Worker._make_sim_worker_dir(sim_specs, workerID) + self._run_calc = Worker._make_runners(sim_specs, gen_specs) Worker._set_job_controller(workerID) @@ -193,12 +197,14 @@ def _make_runners(sim_specs, gen_specs): "Create functions to run a sim or gen" sim_f = sim_specs['sim_f'] - gen_f = sim_specs['gen_f'] + gen_f = gen_specs['gen_f'] def run_sim(calc_in, persis_info, libE_info): + "Call the sim func." return sim_f(calc_in, persis_info, sim_specs, libE_info) def run_gen(calc_in, persis_info, libE_info): + "Call the gen func." return gen_f(calc_in, persis_info, gen_specs, libE_info) return {EVAL_SIM_TAG: run_sim, EVAL_GEN_TAG: run_gen} @@ -234,17 +240,14 @@ def run(self, Work, calc_in): Rows from the :ref:`history array` for processing """ - assert Work['tag'] in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ + calc_type = Work['tag'] + self.calc_iter[calc_type] += 1 + assert calc_type in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ "calc_type must either be EVAL_SIM_TAG or EVAL_GEN_TAG" # calc_stats stores timing and summary info for this Calc (sim or gen) calc_stats = CalcInfo() - - #Timing will include setup/teardown calc_stats.start_timer() - - #Could keep all this inside the Work dictionary if sending all Work ... - calc_type = Work['tag'] calc_stats.calc_type = calc_type try: From 7c5579b69148558e7d1d3ef86107b3960df85e19 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 22:27:40 -0500 Subject: [PATCH 013/101] Moved message number strings into message_numbers.py --- libensemble/calc_info.py | 73 ++++++++++++++-------------------- libensemble/message_numbers.py | 32 ++++++++++++--- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/libensemble/calc_info.py b/libensemble/calc_info.py index 577c82bed..b944ce56c 100644 --- a/libensemble/calc_info.py +++ b/libensemble/calc_info.py @@ -9,36 +9,28 @@ import itertools import os -from libensemble.message_numbers import EVAL_SIM_TAG, EVAL_GEN_TAG - -#Todo: Move calc_status tags here - and make manager signals diff. This will then -#need to be accessed by sim_func...Currently get from message_numbers -from libensemble.message_numbers import WORKER_KILL -from libensemble.message_numbers import WORKER_KILL_ON_ERR -from libensemble.message_numbers import WORKER_KILL_ON_TIMEOUT -from libensemble.message_numbers import JOB_FAILED -from libensemble.message_numbers import WORKER_DONE -from libensemble.message_numbers import MAN_SIGNAL_FINISH -from libensemble.message_numbers import MAN_SIGNAL_KILL -from libensemble.message_numbers import CALC_EXCEPTION +from libensemble.message_numbers import calc_type_strings, calc_status_strings class CalcInfo(): """A class to store and manage statistics for each calculation. - An object of this class represents the statistics for a given calculation. 
+ An object of this class represents the statistics for a given + calculation. **Class Attributes:** :cvar string stat_file: - A class attribute holding the name of the global summary file (default: 'libe_summary.txt') + A class attribute holding the name of the global summary file + (default: 'libe_summary.txt') :cvar string worker_statfile: A class attribute holding the name of the current workers summary file - (default: Initially None, but is set to .w when the file is created) + (default: Initially None, but is set to .w + when the file is created) :cvar boolean keep_worker_stat_files: - A class attribute determining whether worker stat files are kept after merging - to global summary file (default: False). + A class attribute determining whether worker stat files are kept + after merging to global summary file (default: False). **Object Attributes:** @@ -48,7 +40,8 @@ class CalcInfo(): :ivar string date_end: Calculation end date :ivar int calc_type: Type flag:EVAL_SIM_TAG/EVAL_GEN_TAG :ivar int id: Auto-generated ID for this calc (unique within Worker) - :ivar string status: "Description of the status of this calc" + :ivar string status: "Description of the status of this calc +" """ newid = itertools.count() @@ -56,24 +49,6 @@ class CalcInfo(): worker_statfile = None keep_worker_stat_files = False - calc_type_strings = { - EVAL_SIM_TAG: 'sim', - EVAL_GEN_TAG: 'gen', - None: 'No type set' - } - - calc_status_strings = { - MAN_SIGNAL_FINISH: "Manager killed on finish", - MAN_SIGNAL_KILL: "Manager killed job", - WORKER_KILL_ON_ERR: " Worker killed job on Error", - WORKER_KILL_ON_TIMEOUT: "Worker killed job on Timeout", - WORKER_KILL: "Worker killed", - JOB_FAILED: "Job Failed", - WORKER_DONE: "Completed", - CALC_EXCEPTION: "Exception occurred", - None: "Unknown Status" - } - @staticmethod def set_statfile_name(name): """Change the name ofr the statistics file""" @@ -86,8 +61,15 @@ def smart_sort(l): For example: Worker10 comes after Worker9. No padding required """ import re - convert = lambda text: int(text) if text.isdigit() else text - alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] + + def convert(text): + "Convert number strings to numbers, leave other strings alone." + return int(text) if text.isdigit() else text + + def alphanum_key(key): + "Split string into list of substrings and numbers for sort." + return [convert(c) for c in re.split('([0-9]+)', key)] + return sorted(l, key=alphanum_key) @staticmethod @@ -137,7 +119,8 @@ def start_timer(self): self.date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") def stop_timer(self): - """Stop the timer and record datestamp (normally for a calculation) and set total run time""" + """Stop the timer and record datestamp (normally for a + calculation) and set total run time""" self.end = time.time() self.date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") #increment so can start and stop repeatedly @@ -155,14 +138,18 @@ def print_calc(self, fileH): File to print calc statistics to. """ - fileH.write(" Calc %d: %s Time: %.2f Start: %s End: %s Status: %s\n" % (self.id, self.get_type(), self.time, self.date_start, self.date_end, self.status)) + fileH.write(" Calc %d: %s Time: %.2f Start: %s End: %s Status: %s\n" % + (self.id, self.get_type(), self.time, + self.date_start, self.date_end, self.status)) def get_type(self): """Returns the calculation type as a string. - Converts self.calc_type to string. 
self.calc_type should have been set by the worker""" - return CalcInfo.calc_type_strings.get(self.calc_type, "Unknown type") + Converts self.calc_type to string. self.calc_type should have + been set by the worker + """ + return calc_type_strings.get(self.calc_type, "Unknown type") def set_calc_status(self, calc_status_flag): @@ -175,4 +162,4 @@ def set_calc_status(self, calc_status_flag): """ #For now assuming if not got an error - it was ok - self.status = CalcInfo.calc_status_strings.get(calc_status_flag, "Completed") + self.status = calc_status_strings.get(calc_status_flag, "Completed") diff --git a/libensemble/message_numbers.py b/libensemble/message_numbers.py index 822191335..e8ec981ce 100644 --- a/libensemble/message_numbers.py +++ b/libensemble/message_numbers.py @@ -1,12 +1,22 @@ +# --- Tags + UNSET_TAG = 0 #sh temp - this is a libe feature that is to be reviewed for best solution EVAL_SIM_TAG = 1 EVAL_GEN_TAG = 2 STOP_TAG = 3 -PERSIS_STOP = 4 # manager tells a persistent worker to stop (and go back into general worker mode) +PERSIS_STOP = 4 # manager tells persistent worker to desist +FINISHED_PERSISTENT_SIM_TAG = 11 # tells manager sim_f done persistent mode +FINISHED_PERSISTENT_GEN_TAG = 12 # tells manager gen_f done persistent mode +ABORT_ENSEMBLE = 13 # Worker asks manager to abort (and dump history) + +calc_type_strings = { + EVAL_SIM_TAG: 'sim', + EVAL_GEN_TAG: 'gen', + None: 'No type set' +} -FINISHED_PERSISTENT_SIM_TAG = 11 # tells manager sim_f is done with persistent mode -FINISHED_PERSISTENT_GEN_TAG = 12 # tells manager gen_f is done with persistent mode -ABORT_ENSEMBLE = 13 # Worker sends to manager to tell to abort (and dump history) + +# --- Signal flags (in message body vs tags) # CALC STATUS/SIGNAL FLAGS: In future these will be in a data structure MAN_SIGNAL_FINISH = 20 # Kill jobs and shutdown worker @@ -14,9 +24,21 @@ MAN_SIGNAL_REQ_RESEND = 22 # Request worker to resend message MAN_SIGNAL_REQ_PICKLE_DUMP = 23 # Request worker to dump pickled file of message -WORKER_KILL = 30 #Currently for worker kills that are not covered by more specific. In future will allow user description +WORKER_KILL = 30 # Worker kills not covered by a more specific case WORKER_KILL_ON_ERR = 31 WORKER_KILL_ON_TIMEOUT = 32 JOB_FAILED = 33 WORKER_DONE = 34 CALC_EXCEPTION = 35 + +calc_status_strings = { + MAN_SIGNAL_FINISH: "Manager killed on finish", + MAN_SIGNAL_KILL: "Manager killed job", + WORKER_KILL_ON_ERR: " Worker killed job on Error", + WORKER_KILL_ON_TIMEOUT: "Worker killed job on Timeout", + WORKER_KILL: "Worker killed", + JOB_FAILED: "Job Failed", + WORKER_DONE: "Completed", + CALC_EXCEPTION: "Exception occurred", + None: "Unknown Status" +} From c027e049c7f86fa9bd4130ae07dcdfcbc8f83782 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 23:15:43 -0500 Subject: [PATCH 014/101] Added timer object and test harness. --- libensemble/tests/unit_tests/test_timer.py | 43 ++++++++++++ libensemble/timer.py | 79 ++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 libensemble/tests/unit_tests/test_timer.py create mode 100644 libensemble/timer.py diff --git a/libensemble/tests/unit_tests/test_timer.py b/libensemble/tests/unit_tests/test_timer.py new file mode 100644 index 000000000..b76d311b1 --- /dev/null +++ b/libensemble/tests/unit_tests/test_timer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +""" +Unit test of timers for libensemble. +""" + +import time +from libensemble.timer import Timer + +def test_timer(): + "Test timer." 
+ + timer = Timer() + + with timer: + time.sleep(0.5) + e1 = timer.elapsed + + e2 = timer.elapsed + time.sleep(0.1) + e3 = timer.elapsed + + assert (e1 >= 0.5) and (e1 <= 0.6), "Check timed sleep seems correct" + assert e2 >= e1, "Check timer order." + assert e2 == e3, "Check elapsed time stable when timer inactive." + + s1 = timer.date_start + s2 = timer.date_end + assert s1[0:2] == "20", "Start year is 20xx" + assert s2[0:2] == "20", "End year is 20xx" + + s3 = "{}".format(timer) + assert s3 == "Time: {0:.2f} Start: {1} End: {2}".format(e3, s1, s2), \ + "Check string formatting." + + with timer: + time.sleep(0.5) + total1 = timer.total + + assert total1 >= 1 and total1 <= 1.1, \ + "Check cumulative timing (active)." + assert timer.total >= 1 and timer.total <= 1.1, \ + "Check cumulative timing (not active)." diff --git a/libensemble/timer.py b/libensemble/timer.py new file mode 100644 index 000000000..3e9f79d43 --- /dev/null +++ b/libensemble/timer.py @@ -0,0 +1,79 @@ +""" +libensemble utility class -- manages timer +""" + +import time + +class Timer: + """Timer class used in libensemble. + + Attributes + ---------- + + tcum: float: + Total time recorded by timer. + + tstart: float: + Most recent starting time. + + tend: float: + Most recent ending time. + + timing: bool: + Indicates whether the timer is currently active. + """ + + def __init__(self): + """Initialize a new timer.""" + self.tcum = 0.0 + self.tstart = 0.0 + self.tend = 0.0 + self.timing = False + + def __str__(self): + """Return a string representation of the timer.""" + return ("Time: {0:.2f} Start: {1} End: {2}". + format(self.total, self.date_start, self.date_end)) + + @property + def date_start(self): + """Return a string representing the start datetime.""" + return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.tstart)) + + @property + def date_end(self): + """Return a string representing the end datetime.""" + return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.tend)) + + @property + def elapsed(self): + """Return time since last start (active) or in most recent interval.""" + etime = self.tend if not self.timing else time.time() + return etime-self.tstart + + @property + def total(self): + """Return the total time since last start.""" + if self.timing: + return self.tcum + self.elapsed + return self.tcum + + def start(self): + """Start the timer.""" + self.tstart = time.time() + self.timing = True + + def stop(self): + """Stop the timer.""" + self.tend = time.time() + self.timing = False + self.tcum += (self.tend-self.tstart) + + def __enter__(self): + """Enter a timing context.""" + self.start() + return self + + def __exit__(self, etype, value, traceback): + """Exit a timing context.""" + self.stop() From ff055d0ade261987cda4d9d2a6120f6c1c4c0574 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 28 Aug 2018 23:31:11 -0500 Subject: [PATCH 015/101] Switched manager/worker timing to use timer class --- libensemble/calc_info.py | 28 ++++------------------------ libensemble/libE_manager.py | 15 ++++----------- libensemble/libE_worker.py | 7 +++---- 3 files changed, 11 insertions(+), 39 deletions(-) diff --git a/libensemble/calc_info.py b/libensemble/calc_info.py index b944ce56c..7e2b119d6 100644 --- a/libensemble/calc_info.py +++ b/libensemble/calc_info.py @@ -4,11 +4,10 @@ This includes creating the statistics (or calc summary) file. 
""" -import time -import datetime import itertools import os +from libensemble.timer import Timer from libensemble.message_numbers import calc_type_strings, calc_status_strings class CalcInfo(): @@ -104,28 +103,11 @@ def __init__(self): A new CalcInfo object is created for each calculation. """ - self.time = 0.0 - self.start = 0.0 - self.end = 0.0 - self.date_start = None - self.date_end = None + self.timer = Timer() self.calc_type = None self.id = next(CalcInfo.newid) self.status = "Not complete" - def start_timer(self): - """Start the timer and record datestamp (normally for a calculation)""" - self.start = time.time() - self.date_start = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - - def stop_timer(self): - """Stop the timer and record datestamp (normally for a - calculation) and set total run time""" - self.end = time.time() - self.date_end = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") - #increment so can start and stop repeatedly - self.time += self.end - self.start - def print_calc(self, fileH): """Print a calculation summary. @@ -138,10 +120,8 @@ def print_calc(self, fileH): File to print calc statistics to. """ - fileH.write(" Calc %d: %s Time: %.2f Start: %s End: %s Status: %s\n" % - (self.id, self.get_type(), self.time, - self.date_start, self.date_end, self.status)) - + fileH.write(" Calc {}: {} {} Status: {}\n". + format(self.id, self.get_type(), self.timer, self.status)) def get_type(self): """Returns the calculation type as a string. diff --git a/libensemble/libE_manager.py b/libensemble/libE_manager.py index 9907df322..babc09775 100644 --- a/libensemble/libE_manager.py +++ b/libensemble/libE_manager.py @@ -6,7 +6,6 @@ from __future__ import division from __future__ import absolute_import -import time import sys import os import logging @@ -16,6 +15,7 @@ from mpi4py import MPI import numpy as np +from libensemble.timer import Timer from libensemble.message_numbers import \ EVAL_SIM_TAG, FINISHED_PERSISTENT_SIM_TAG, \ EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, \ @@ -42,15 +42,6 @@ def manager_main(hist, libE_specs, alloc_specs, return mgr.run(persis_info) -def get_stopwatch(): - "Return an elapsed time function, starting now" - start_time = time.time() - def elapsed(): - "Return time elapsed since start." - return time.time()-start_time - return elapsed - - def filter_nans(array): "Filter out NaNs from a numpy array." 
return array[~np.isnan(array)] @@ -67,13 +58,15 @@ class Manager: def __init__(self, hist, libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria): """Initialize the manager.""" + timer = Timer() + timer.start() self.hist = hist self.libE_specs = libE_specs self.alloc_specs = alloc_specs self.sim_specs = sim_specs self.gen_specs = gen_specs self.exit_criteria = exit_criteria - self.elapsed = get_stopwatch() + self.elapsed = lambda: timer.elapsed self.comm = libE_specs['comm'] self.W = self._make_worker_pool(self.comm) self.term_tests = \ diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index 97f68d4bf..ea8cc9d6c 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -247,13 +247,13 @@ def run(self, Work, calc_in): # calc_stats stores timing and summary info for this Calc (sim or gen) calc_stats = CalcInfo() - calc_stats.start_timer() calc_stats.calc_type = calc_type try: calc = self._run_calc[calc_type] - with self.loc_stack.loc(calc_type): - out = calc(calc_in, Work['persis_info'], Work['libE_info']) + with calc_stats.timer: + with self.loc_stack.loc(calc_type): + out = calc(calc_in, Work['persis_info'], Work['libE_info']) assert isinstance(out, tuple), \ "Calculation output must be a tuple." @@ -266,7 +266,6 @@ def run(self, Work, calc_in): calc_status = CALC_EXCEPTION raise finally: - calc_stats.stop_timer() calc_stats.set_calc_status(calc_status) CalcInfo.add_calc_worker_statfile(calc=calc_stats) From 9c352f309a032be6f37409345ebb547ccaf56b14 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 31 Aug 2018 10:22:25 -0500 Subject: [PATCH 016/101] Add "default_app" getter to registry. --- libensemble/register.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libensemble/register.py b/libensemble/register.py index 8ba6eb1dc..bd026420c 100644 --- a/libensemble/register.py +++ b/libensemble/register.py @@ -52,6 +52,11 @@ def gen_default_app(self): """Return the default generator app.""" return self._default_apps['gen'] + @property + def default_app(self, calc_type): + """Return the default calc_type app.""" + return self._default_apps.get(calc_type) + def __init__(self, default=True): '''Instantiate a new Register instance From daee6ad91ce54b5117e17be013b9059aa94031e0 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Wed, 5 Sep 2018 16:04:13 -0500 Subject: [PATCH 017/101] Added launcher. --- libensemble/controller.py | 2 +- libensemble/launcher.py | 130 ++++++++++++++++++ libensemble/tests/unit_tests/launch_busy.py | 19 +++ libensemble/tests/unit_tests/test_launcher.py | 71 ++++++++++ 4 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 libensemble/launcher.py create mode 100644 libensemble/tests/unit_tests/launch_busy.py create mode 100644 libensemble/tests/unit_tests/test_launcher.py diff --git a/libensemble/controller.py b/libensemble/controller.py index c75067192..55ad55ca7 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -674,7 +674,7 @@ def kill(self, job): raise JobControllerException('Unknown kill signal') try: JobController._kill_process(job.process, sig[self.kill_signal]) - except ProcessLookupError: + except OSError: # In Python 3, ProcessLookupError logger.warning("Tried to kill job {}. Process {} not found. 
May have finished".format(job.name, job.process.pid)) # Wait for job to be killed diff --git a/libensemble/launcher.py b/libensemble/launcher.py new file mode 100644 index 000000000..8ad329e79 --- /dev/null +++ b/libensemble/launcher.py @@ -0,0 +1,130 @@ +""" +libensemble helpers for launching subprocesses. +==================================================== +""" + +import os +import sys +import shlex +import signal +import time + +from itertools import chain + + +# See: https://github.com/google/python-subprocess32 +if os.name == 'posix' and sys.version_info[0] < 3: + try: + import subprocess32 as subprocess + have_subprocess32 = True + except ImportError: + import subprocess + have_subprocess32 = False +else: + import subprocess + have_subprocess32 = (sys.version_info[0] >= 3 + and sys.version_info[1] >= 2) + + +def form_command(cmd_template, specs): + "Fill command parts with dict entries from specs; drop any missing." + specs = {k: v for k, v in specs.items() if v is not None} + def fill(fmt): + "Fill a template string and split with shlex; drop if missing specs" + try: + return shlex.split(fmt.format(**specs)) + except KeyError: + return None + return list(chain.from_iterable(filter(None, map(fill, cmd_template)))) + + +def launch(cmd_template, specs=None, **kwargs): + "Launch a new subprocess (with command templating and Python 3 help)." + if not have_subprocess32 and kwargs.get('start_new_session'): + del kwargs['start_new_session'] + kwargs['preexec_fn'] = os.setsid + cmd = (form_command(cmd_template, specs) if specs is not None + else cmd_template) + return subprocess.Popen(cmd, **kwargs) + + +def killpg(process): + "Kill the process (and group if it is group leader)." + try: + pid = process.pid + pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 + if pgid == pid: + os.killpg(pgid, signal.SIGKILL) + else: + process.kill() + return True + except OSError: # In Python 3: ProcessLookupError + return False + + +def terminatepg(process): + "Send termination signal to the process (and group if it is group leader)" + try: + pid = process.pid + pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 + if pgid == pid: + os.killpg(pgid, signal.SIGTERM) + elif hasattr(signal, 'CTRL_BREAK_EVENT'): + # Supposedly does a group terminate for Windows... + process.send_signal(signal.CTRL_BREAK_EVENT) + else: + process.terminate() + return True + except OSError: # In Python 3: ProcessLookupError + return False + + +def process_is_stopped(process, timeout): + "Wait for timeout to see if process is finished; True if done." + start_time = time.time() + while time.time() - start_time < timeout: + time.sleep(0.01) + if process.poll() is not None: + return True + return process.poll() is not None + + +if sys.version_info[0] < 3 or (sys.version_info[0] == 3 and sys.version_info[1] < 3): + + # Python 3.3 added timeout arguments + def wait(process, timeout=None): + "Wait on a process with timeout." + if timeout is None or process_is_stopped(process, timeout): + return process.wait() + return None + +else: + + def wait(process, timeout=None): + "Wait on a process with timeout (wait forever if None)." + try: + return process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + return None + + +def wait_and_kill(process, timeout): + "Give a grace period for a process to terminate, then kill it." 
+ rc = wait(process, timeout) + if rc is not None: + return rc + killpg(process) + return process.wait() + + +def cancel(process, timeout=0): + "Send a termination signal, give a grace period, then hard kill if needed." + if timeout is not None and timeout > 0: + terminatepg(process) + return wait_and_kill(process, timeout) + + +# Note: cancel with timeout 0 -- just kill and then wait +# cancel with timeout None -- just terminate and then wait +# cancel with timeout > 0 -- try terminating, then hard kill if needed + diff --git a/libensemble/tests/unit_tests/launch_busy.py b/libensemble/tests/unit_tests/launch_busy.py new file mode 100644 index 000000000..06c98a629 --- /dev/null +++ b/libensemble/tests/unit_tests/launch_busy.py @@ -0,0 +1,19 @@ +import time +import sys +import signal + +def ignore_handler(signum, frame): + print("Ignoring SIGTERM") + +def main(ignore_term=False, wait_time=-1): + print("Call with {}, {}".format(ignore_term, wait_time)) + if ignore_term: + signal.signal(signal.SIGTERM, ignore_handler) + if wait_time > 0: + time.sleep(wait_time) + else: + while True: + pass + +if __name__ == "__main__": + main(*[float(x) for x in sys.argv[1:]]) diff --git a/libensemble/tests/unit_tests/test_launcher.py b/libensemble/tests/unit_tests/test_launcher.py new file mode 100644 index 000000000..38b81ba51 --- /dev/null +++ b/libensemble/tests/unit_tests/test_launcher.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +""" +Unit test of launcher helpers for libensemble. +""" + +import sys +import libensemble.launcher as launcher + + +def test_form_command(): + "Test the command templating." + + run_specs = {'mpirun' : 'mpirun', + 'nproc' : 10, + 'nrank' : 5, + 'mf' : None} + + cmd = ['{mpirun}', '-n {nproc}', '-nper {nrank}', '-machinefile {mf}', + 'more arguments "ho hum"'] + args = launcher.form_command(cmd, run_specs) + aref = ['mpirun', '-n', '10', '-nper', '5', 'more', 'arguments', 'ho hum'] + + assert args == aref, "Command templating test failed." + + +def test_launch(): + "Test simple launch." + + py_exe = sys.executable or "python" + + # Launch infinite loop, pay attention to term + process = launcher.launch([py_exe, "launch_busy.py"]) + assert not launcher.process_is_stopped(process, 0.1), \ + "Process stopped early." + launcher.cancel(process, 0.5) + + # Launch infinite loop, ignore term + process = launcher.launch([py_exe, "launch_busy.py", "1"]) + assert not launcher.process_is_stopped(process, 0.5), \ + "Process stopped early." + launcher.cancel(process, 0.5) + + # Launch infinite loop, pay attention to term + process = launcher.launch([py_exe, "launch_busy.py"], + start_new_session=True) + assert not launcher.process_is_stopped(process, 0.1), \ + "Process stopped early." + launcher.cancel(process, 0.5) + + # Launch infinite loop, ignore term + process = launcher.launch([py_exe, "launch_busy.py", "1"], + start_new_session=True) + assert not launcher.process_is_stopped(process, 0.5), \ + "Process stopped early." + launcher.cancel(process, 0.5) + + # Check proper handling of ProcessLookupError + assert not launcher.killpg(process), "Expected lookup error." + assert not launcher.terminatepg(process), "Expected lookup error." + + # Launch finite loop, wait for termination + process = launcher.launch([py_exe, "launch_busy.py", "0", "0.1"]) + assert launcher.process_is_stopped(process, 0.5), \ + "Process should have stopped earlier." 
+ + # Try simple kill + process = launcher.launch([py_exe, "launch_busy.py", "1"]) + assert not launcher.process_is_stopped(process, 0.5), \ + "Process stopped early." + launcher.cancel(process, 0) From 2ce7bf229f5d125c3e212ce4ae9820f18f031988 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:03:56 -0500 Subject: [PATCH 018/101] Introduce jassert wrapper for raising JobControllerException. --- libensemble/controller.py | 200 +++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 109 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 2d16002b1..4dd5d9100 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -24,18 +24,18 @@ #For debug messages in this module - uncomment (see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) -STATES = ''' +STATES = """ UNKNOWN CREATED WAITING RUNNING FINISHED USER_KILLED -FAILED'''.split() +FAILED""".split() -SIGNALS = ''' +SIGNALS = """ SIGTERM -SIGKILL'''.split() +SIGKILL""".split() #I may want to use a top-level abstract/base class for maximum re-use @@ -43,23 +43,26 @@ class JobControllerException(Exception): pass +def jassert(test, *args): + if not test: + raise JobControllerException(*args) -class Job: - ''' +class Job: + """ Manage the creation, configuration and status of a launchable job. - ''' + """ newid = itertools.count() def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, hostlist=None, workdir=None, stdout=None, workerid=None): - '''Instantiate a new Job instance. + """Instantiate a new Job instance. A new job object is created with an id, status and configuration attributes This will normally be created by the job_controller on a launch - ''' + """ self.id = next(Job.newid) #Status attributes @@ -84,8 +87,9 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, rank self.stdout = stdout self.workerID = workerid - if app is None: - raise JobControllerException("Job must be created with an app - no app found for job {}".format(self.id)) + jassert(app is not None, + "Job must be created with an app - no app found for job {}". 
+ format(self.id)) worker_name = "_worker{}".format(self.workerID) if self.workerID else "" self.name = "job_{}{}_{}".format(app.name, worker_name, self.id) @@ -93,15 +97,15 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, rank self.workdir = workdir def workdir_exists(self): - ''' Returns True if the job's workdir exists, else False ''' + """ Returns True if the job's workdir exists, else False """ return self.workdir and os.path.exists(self.workdir) def file_exists_in_workdir(self, filename): - ''' Returns True if the named file exists in the job's workdir, else False ''' + """ Returns True if the named file exists in the job's workdir, else False """ return self.workdir and os.path.exists(os.path.join(self.workdir, filename)) def read_file_in_workdir(self, filename): - ''' Open and reads the named file in the job's workdir ''' + """ Open and reads the named file in the job's workdir """ path = os.path.join(self.workdir, filename) if not os.path.exists(path): raise ValueError("{} not found in working directory".format(filename)) @@ -109,11 +113,11 @@ def read_file_in_workdir(self, filename): return f.read() def stdout_exists(self): - ''' Returns True if the job's stdout file exists in the workdir, else False ''' + """ Returns True if the job's stdout file exists in the workdir, else False """ return self.file_exists_in_workdir(self.stdout) def read_stdout(self): - ''' Open and reads the job's stdout file in the job's workdir ''' + """ Open and reads the job's stdout file in the job's workdir """ return self.read_file_in_workdir(self.stdout) @@ -136,20 +140,20 @@ def calc_job_timing(self): class BalsamJob(Job): - '''Wraps a Balsam Job from the Balsam service. + """Wraps a Balsam Job from the Balsam service. The same attributes and query routines are implemented. - ''' + """ #newid = itertools.count() #hopefully can use the one in Job def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, hostlist=None, workdir=None, stdout=None, workerid=None): - '''Instantiate a new BalsamJob instance. + """Instantiate a new BalsamJob instance. A new BalsamJob object is created with an id, status and configuration attributes This will normally be created by the job_controller on a launch - ''' + """ super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, workdir, stdout, workerid) @@ -182,7 +186,7 @@ def calc_job_timing(self): class JobController: - ''' The job_controller can create, poll and kill runnable jobs + """ The job_controller can create, poll and kill runnable jobs **Class Attributes:** @@ -191,7 +195,7 @@ class JobController: controller : Obj: JobController or inherited class. A class attribute holding the default job_controller. 
- ''' + """ controller = None @@ -210,15 +214,15 @@ def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): #If all set then check num_procs equals num_nodes*ranks_per_node and set values as given if num_procs is not None and num_nodes is not None and ranks_per_node is not None: - if num_procs != num_nodes*ranks_per_node: - raise JobControllerException("num_procs does not equal num_nodes*ranks_per_node") + jassert(num_procs == num_nodes*ranks_per_node, + "num_procs does not equal num_nodes*ranks_per_node") return num_procs, num_nodes, ranks_per_node #If num_procs not set then need num_nodes and ranks_per_node and set num_procs if num_procs is None: #Note this covers case where none are set - may want to use job_controller defaults in that case - not implemented yet. - if num_nodes is None or ranks_per_node is None: - raise JobControllerException("Must set either num_procs or num_nodes/ranks_per_node or machinefile") + jassert(num_nodes is not None and ranks_per_node is not None, + "Must set either num_procs or num_nodes/ranks_per_node or machinefile") num_procs = num_nodes * ranks_per_node return num_procs, num_nodes, ranks_per_node @@ -255,7 +259,7 @@ def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): #job.total_time = time.time() - job.launch_time def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): - '''Instantiate a new JobController instance. + """Instantiate a new JobController instance. A new JobController object is created with an application registry and configuration attributes. A registry object must have been created. @@ -278,11 +282,10 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, The environment variable giving a node list in Cobalt format (Default: Uses COBALT_PARTNAME) Note: This is only queried if a worker_list file is not provided and auto_resources=True. - ''' + """ self.registry = registry or Register.default_registry - if self.registry is None: - raise JobControllerException("Cannot find default registry") + jassert(self.registry is not None, "Cannot find default registry") self.top_level_dir = os.getcwd() self.auto_resources = auto_resources @@ -355,7 +358,7 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stage_inout=None, hyperthreads=False, test=False): - ''' Creates a new job, and either launches or schedules to launch in the job controller + """ Creates a new job, and either launches or schedules to launch in the job controller The created job object is returned. @@ -402,17 +405,17 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, Note that if some combination of num_procs, num_nodes and ranks_per_node are provided, these will be honored if possible. If resource detection is on and these are omitted, then the available resources will be divided amongst workers. 
- ''' + """ # Find the default sim or gen app from registry.sim_default_app OR registry.gen_default_app # Could take optional app arg - if they want to supply here - instead of taking from registry if calc_type == 'sim': - if self.registry.sim_default_app is None: - raise JobControllerException("Default sim app is not set") + jassert(self.registry.sim_default_app is not None, + "Default sim app is not set") app = self.registry.sim_default_app elif calc_type == 'gen': - if self.registry.gen_default_app is None: - raise JobControllerException("Default gen app is not set") + jassert(self.registry.gen_default_app is not None, + "Default gen app is not set") app = self.registry.gen_default_app else: raise JobControllerException("Unrecognized calculation type", calc_type) @@ -436,8 +439,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, else: machinefile = 'machinefile_autogen' mfile_created, num_procs, num_nodes, ranks_per_node = self.create_machinefile(machinefile, num_procs, num_nodes, ranks_per_node, hyperthreads) - if not mfile_created: - raise JobControllerException("Auto-creation of machinefile failed") + jassert(mfile_created, "Auto-creation of machinefile failed") else: num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node, machinefile) @@ -514,7 +516,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, def poll(self, job): - ''' Polls and updates the status attributes of the supplied job + """ Polls and updates the status attributes of the supplied job Parameters ----------- @@ -522,17 +524,15 @@ def poll(self, job): job: obj: Job The job object.to be polled. - ''' + """ - if not isinstance(job, Job): - raise JobControllerException('Invalid job has been provided') + jassert(isinstance(job, Job), "Invalid job has been provided") # Check the jobs been launched (i.e. it has a process ID) - if job.process is None: - #logger.warning('Polled job has no process ID - returning stored state') - #Prob should be recoverable and return state - but currently fatal - raise JobControllerException('Polled job {} has no process ID - check jobs been launched'.format(job.name)) - + #Prob should be recoverable and return state - but currently fatal + jassert(job.process is not None, + "Polled job {} has no process ID - check jobs been launched". + format(job.name)) # Do not poll if job already finished # Maybe should re-poll job to check (in case self.finished set in error!)??? if job.finished: @@ -570,7 +570,7 @@ def poll(self, job): #return job def manager_poll(self, job): - ''' Polls for a manager signal + """ Polls for a manager signal Parameters ----------- @@ -581,7 +581,7 @@ def manager_poll(self, job): The job status attribute job.manager_signal will be updated. - ''' + """ #Will use MPI_MODE from settings.py but for now assume MPI from libensemble.message_numbers import STOP_TAG, MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL @@ -624,7 +624,7 @@ def _time_out(process, timeout): def kill(self, job): - ''' Kills or cancels the supplied job + """ Kills or cancels the supplied job Parameters ----------- @@ -638,10 +638,9 @@ def kill(self, job): a SIGKILL will be sent if the job has not finished after seconds. The kill can be configured using the set_kill_mode function. 
- ''' + """ - if not isinstance(job, Job): - raise JobControllerException('Invalid job has been provided') + jassert(isinstance(job, Job), "Invalid job has been provided") if job.finished: logger.warning('Trying to kill job that is no longer running. Job {}: Status is {}'.format(job.name, job.state)) @@ -649,17 +648,14 @@ def kill(self, job): if job.process is None: time.sleep(0.2) - if job.process is None: - #logger.warning('Polled job has no process ID - returning stored state') - #Prob should be recoverable and return state - but currently fatal - raise JobControllerException('Attempting to kill job {} that has no process ID - check jobs been launched'.format(job.name)) + jassert(job.process is not None, + "Attempting to kill job {} that has no process ID - check jobs been launched".format(job.name)) logger.debug("Killing job {}".format(job.name)) # Issue signal sig = {'SIGTERM': signal.SIGTERM, 'SIGKILL': signal.SIGKILL} - if self.kill_signal not in sig: - raise JobControllerException('Unknown kill signal') + jassert(self.kill_signal in sig, "Unknown kill signal") try: JobController._kill_process(job.process, sig[self.kill_signal]) except OSError: # In Python 3, ProcessLookupError @@ -701,7 +697,7 @@ def kill(self, job): def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): - ''' Configures the kill mode for the job_controller + """ Configures the kill mode for the job_controller Parameters ---------- @@ -717,10 +713,9 @@ def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): (Default is 60). - ''' + """ if signal is not None: - if signal not in SIGNALS: - raise JobControllerException("Unknown signal {} supplied to set_kill_mode".format(signal)) + jassert(signal in SIGNALS, "Unknown signal {} supplied to set_kill_mode".format(signal)) self.kill_signal = signal if wait_and_kill is not None: @@ -733,7 +728,7 @@ def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): def get_job(self, jobid): - ''' Returns the job object for the supplied job ID ''' + """ Returns the job object for the supplied job ID """ if self.list_of_jobs: for job in self.list_of_jobs: if job.id == jobid: @@ -776,8 +771,7 @@ def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyp else: cores_avail_per_node_per_worker = cores_avail_per_node - if not node_list: - raise JobControllerException("Node list is empty - aborting") + jassert(node_list, "Node list is empty - aborting") #If no decomposition supplied - use all available cores/nodes if num_procs is None and num_nodes is None and ranks_per_node is None: @@ -798,23 +792,20 @@ def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyp #checks config is consistent and sufficient to express - does not check actual resources num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) - if num_nodes > local_node_count: - #Could just downgrade to those available with warning - for now error - raise JobControllerException("Not enough nodes to honour arguments. Requested {}. Only {} available".format(num_nodes, local_node_count)) + #Could just downgrade to those available with warning - for now error + jassert(num_nodes <= local_node_count, + "Not enough nodes to honour arguments. Requested {}. 
Only {} available".format(num_nodes, local_node_count)) - elif ranks_per_node > cores_avail_per_node: - #Could just downgrade to those available with warning - for now error - raise JobControllerException("Not enough processors on a node to honour arguments. Requested {}. Only {} available".format(ranks_per_node, cores_avail_per_node)) + jassert(ranks_per_node <= cores_avail_per_node, + "Not enough processors on a node to honour arguments. Requested {}. Only {} available".format(ranks_per_node, cores_avail_per_node)) - elif ranks_per_node > cores_avail_per_node_per_worker: - #Could just downgrade to those available with warning - for now error - raise JobControllerException("Not enough processors per worker to honour arguments. Requested {}. Only {} available".format(ranks_per_node, cores_avail_per_node_per_worker)) + jassert(ranks_per_node <= cores_avail_per_node_per_worker, + "Not enough processors per worker to honour arguments. Requested {}. Only {} available".format(ranks_per_node, cores_avail_per_node_per_worker)) - elif num_procs > (cores_avail_per_node * local_node_count): - #Could just downgrade to those available with warning - for now error - raise JobControllerException("Not enough procs to honour arguments. Requested {}. Only {} available".format(num_procs, cores_avail_per_node*local_node_count)) + jassert(num_procs <= (cores_avail_per_node * local_node_count), + "Not enough procs to honour arguments. Requested {}. Only {} available".format(num_procs, cores_avail_per_node*local_node_count)) - elif num_nodes < local_node_count: + if num_nodes < local_node_count: logger.warning("User constraints mean fewer nodes being used than available. {} nodes used. {} nodes available".format(num_nodes, local_node_count)) return num_procs, num_nodes, ranks_per_node @@ -861,27 +852,26 @@ def get_hostlist(self): class BalsamJobController(JobController): - '''Inherits from JobController and wraps the Balsam job management service + """Inherits from JobController and wraps the Balsam job management service .. note:: Job kills are currently not configurable in the Balsam job_controller. The set_kill_mode function will do nothing but print a warning. - ''' + """ #controller = None def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): - '''Instantiate a new BalsamJobController instance. + """Instantiate a new BalsamJobController instance. A new BalsamJobController object is created with an application registry and configuration attributes - ''' + """ #Will use super - atleast if use baseclass - but for now dont want to set self.mpi_launcher etc... self.registry = registry or Register.default_registry - if self.registry is None: - raise JobControllerException("Cannot find default registry") + jassert(self.registry, "Cannot find default registry") self.top_level_dir = os.getcwd() self.auto_resources = auto_resources @@ -913,24 +903,20 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stage_inout=None, test=False, hyperthreads=False): - ''' Creates a new job, and either launches or schedules to launch in the job controller + """ Creates a new job, and either launches or schedules to launch in the job controller The created job object is returned. 
- ''' + """ import balsam.launcher.dag as dag # Find the default sim or gen app from registry.sim_default_app OR registry.gen_default_app # Could take optional app arg - if they want to supply here - instead of taking from registry if calc_type == 'sim': - if self.registry.sim_default_app is None: - raise JobControllerException("Default sim app is not set") - else: - app = self.registry.sim_default_app + jassert(self.registry.sim_default_app, "Default sim app is not set") + app = self.registry.sim_default_app elif calc_type == 'gen': - if self.registry.gen_default_app is not None: - raise JobControllerException("Default gen app is not set") - else: - app = self.registry.gen_default_app + jassert(self.registry.gen_default_app, "Default gen app is not set") + app = self.registry.gen_default_app else: raise JobControllerException("Unrecognized calculation type", calc_type) @@ -941,8 +927,8 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, #Specific to this class if machinefile is not None: logger.warning("machinefile arg ignored - not supported in Balsam") - if num_procs is None and num_nodes is None and ranks_per_node is None: - raise JobControllerException("No procs/nodes provided - aborting") + jassert(num_procs or num_nodes or ranks_per_node, + "No procs/nodes provided - aborting") #Set num_procs, num_nodes and ranks_per_node for this job @@ -998,15 +984,12 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, def poll(self, job): - ''' Polls and updates the status attributes of the supplied job ''' - if not isinstance(job, BalsamJob): - raise JobControllerException('Invalid job has been provided') + """ Polls and updates the status attributes of the supplied job """ + jassert(isinstance(job, BalsamJob), "Invalid job has been provided") # Check the jobs been launched (i.e. it has a process ID) - if job.process is None: - #logger.warning('Polled job has no process ID - returning stored state') - #Prob should be recoverable and return state - but currently fatal - raise JobControllerException('Polled job has no process ID - check jobs been launched') + #Prob should be recoverable and return state - but currently fatal + jassert(job.process, "Polled job has no process ID - check jobs been launched") # Do not poll if job already finished if job.finished: @@ -1059,10 +1042,9 @@ def poll(self, job): #return job def kill(self, job): - ''' Kills or cancels the supplied job ''' + """ Kills or cancels the supplied job """ - if not isinstance(job, BalsamJob): - raise JobControllerException('Invalid job has been provided') + jassert(isinstance(job, BalsamJob), "Invalid job has been provided") import balsam.launcher.dag as dag dag.kill(job.process) @@ -1076,8 +1058,8 @@ def kill(self, job): #Check if can wait for kill to complete - affect signal used etc.... def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): - ''' Not currently implemented for BalsamJobController. + """ Not currently implemented for BalsamJobController. No action is taken - ''' + """ logger.warning("set_kill_mode currently has no action with Balsam controller") From f9667e057313550008e3d4a3d60a2dbe934f1928 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:08:43 -0500 Subject: [PATCH 019/101] Replace logic in controller job_partition with simplified equivalent. 
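Hypothetical worked examples (not part of the diff below) of the simplified
partition logic, with expected results traced by hand from the new branch
structure; assumes libensemble is importable in your environment.

    from libensemble.controller import JobController

    # A machinefile takes precedence and leaves the partition unset
    assert JobController.job_partition(None, None, None, 'nodes.txt') == (None, None, None)

    # Missing values are filled in from those supplied
    assert JobController.job_partition(None, 2, 4) == (8, 2, 4)      # num_procs = 2*4
    assert JobController.job_partition(8, 2, None) == (8, 2, 4)      # ranks_per_node = 8//2
    assert JobController.job_partition(8, None, None) == (8, 1, 8)   # procs only -> one node

    # A fully specified but inconsistent request fails the single final check
    # JobController.job_partition(7, 2, 4)   # raises JobControllerException
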
--- libensemble/controller.py | 45 +++++++++++++-------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 4dd5d9100..04923bc86 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -204,43 +204,28 @@ def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): """ Takes provided nprocs/nodes/ranks and outputs working configuration of procs/nodes/ranks or error """ #If machinefile is provided - ignore everything else - if machinefile is not None: - if num_procs is not None or num_nodes is not None or ranks_per_node is not None: + if machinefile: + if num_procs or num_nodes or ranks_per_node: logger.warning('Machinefile provided - overriding procs/nodes/ranks_per_node') - num_procs = None - num_nodes = None - ranks_per_node = None - return num_procs, num_nodes, ranks_per_node - - #If all set then check num_procs equals num_nodes*ranks_per_node and set values as given - if num_procs is not None and num_nodes is not None and ranks_per_node is not None: - jassert(num_procs == num_nodes*ranks_per_node, - "num_procs does not equal num_nodes*ranks_per_node") - return num_procs, num_nodes, ranks_per_node - - #If num_procs not set then need num_nodes and ranks_per_node and set num_procs - if num_procs is None: - #Note this covers case where none are set - may want to use job_controller defaults in that case - not implemented yet. - jassert(num_nodes is not None and ranks_per_node is not None, + return None, None, None + + if not num_procs: + jassert(num_nodes and ranks_per_node, "Must set either num_procs or num_nodes/ranks_per_node or machinefile") num_procs = num_nodes * ranks_per_node - return num_procs, num_nodes, ranks_per_node - #If num_procs is set - fill in any other values - #if num_procs is not None: - else: - if num_nodes is None: - if ranks_per_node is None: - #Currently not auto-detecting so if only num_procs - you are on 1 node - num_nodes = 1 - ranks_per_node = num_procs - else: - num_nodes = num_procs//ranks_per_node - else: - ranks_per_node = num_procs//num_nodes + elif not num_nodes: + ranks_per_node = ranks_per_node or num_procs + num_nodes = num_procs//ranks_per_node + + elif not ranks_per_node: + ranks_per_node = num_procs//num_nodes + jassert(num_procs == num_nodes*ranks_per_node, + "num_procs does not equal num_nodes*ranks_per_node") return num_procs, num_nodes, ranks_per_node + #def _calc_job_timing(job): #if job.launch_time is None: From 1bbff50299738ad4221d003597a63aa238094662 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:16:27 -0500 Subject: [PATCH 020/101] Cleaning up some comment strings. 
--- libensemble/controller.py | 133 +++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 04923bc86..f78f36d00 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -97,15 +97,15 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, rank self.workdir = workdir def workdir_exists(self): - """ Returns True if the job's workdir exists, else False """ + """Returns True if the job's workdir exists""" return self.workdir and os.path.exists(self.workdir) def file_exists_in_workdir(self, filename): - """ Returns True if the named file exists in the job's workdir, else False """ + """Returns True if the named file exists in the job's workdir""" return self.workdir and os.path.exists(os.path.join(self.workdir, filename)) def read_file_in_workdir(self, filename): - """ Open and reads the named file in the job's workdir """ + """Open and reads the named file in the job's workdir """ path = os.path.join(self.workdir, filename) if not os.path.exists(path): raise ValueError("{} not found in working directory".format(filename)) @@ -113,11 +113,11 @@ def read_file_in_workdir(self, filename): return f.read() def stdout_exists(self): - """ Returns True if the job's stdout file exists in the workdir, else False """ + """Returns True if the job's stdout file exists in the workdir""" return self.file_exists_in_workdir(self.stdout) def read_stdout(self): - """ Open and reads the job's stdout file in the job's workdir """ + """Open and reads the job's stdout file in the job's workdir""" return self.read_file_in_workdir(self.stdout) @@ -139,7 +139,6 @@ def calc_job_timing(self): class BalsamJob(Job): - """Wraps a Balsam Job from the Balsam service. The same attributes and query routines are implemented. @@ -151,8 +150,9 @@ class BalsamJob(Job): def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, hostlist=None, workdir=None, stdout=None, workerid=None): """Instantiate a new BalsamJob instance. - A new BalsamJob object is created with an id, status and configuration attributes - This will normally be created by the job_controller on a launch + A new BalsamJob object is created with an id, status and + configuration attributes. This will normally be created by the + job_controller on a launch. """ super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, workdir, stdout, workerid) @@ -226,47 +226,37 @@ def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): return num_procs, num_nodes, ranks_per_node - #def _calc_job_timing(job): - - #if job.launch_time is None: - #logger.warning("Cannot calc job timing - launch time not set") - #return - - ##In case already been killed and set then - #if job.runtime is None: - #job.runtime = time.time() - job.launch_time - - ##For direct launched jobs - these should be the same. - #if job.total_time is None: - #if job.runtime is not None: - #job.total_time = job.runtime - #else: - #job.total_time = time.time() - job.launch_time - def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new JobController instance. - A new JobController object is created with an application registry and configuration attributes. A - registry object must have been created. + A new JobController object is created with an application + registry and configuration attributes. 
A registry object must + have been created. - This is typically created in the user calling script. If auto_resources is True, an evaluation of system resources is performance during this call. + This is typically created in the user calling script. If + auto_resources is True, an evaluation of system resources is + performance during this call. Parameters ---------- registry: obj: Registry, optional - A registry containing the applications to use in this job_controller (Default: Use Register.default_registry). + A registry containing the applications to use in this + job_controller (Default: Use Register.default_registry). auto_resources: Boolean, optional - Auto-detect available processor resources and assign to jobs if not explicitly provided on launch. + Auto-detect available processor resources and assign to jobs + if not explicitly provided on launch. nodelist_env_slurm: String, optional - The environment variable giving a node list in Slurm format (Default: Uses SLURM_NODELIST) - Note: This is only queried if a worker_list file is not provided and auto_resources=True. + The environment variable giving a node list in Slurm format + (Default: Uses SLURM_NODELIST). Note: This is only queried if + a worker_list file is not provided and auto_resources=True. nodelist_env_cobalt: String, optional - The environment variable giving a node list in Cobalt format (Default: Uses COBALT_PARTNAME) - Note: This is only queried if a worker_list file is not provided and auto_resources=True. - + The environment variable giving a node list in Cobalt format + (Default: Uses COBALT_PARTNAME) Note: This is only queried + if a worker_list file is not provided and + auto_resources=True. """ self.registry = registry or Register.default_registry @@ -343,7 +333,7 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stage_inout=None, hyperthreads=False, test=False): - """ Creates a new job, and either launches or schedules to launch in the job controller + """Creates a new job, and either launches or schedules launch. The created job object is returned. @@ -366,19 +356,22 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, Name of a machinefile for this job to use. app_args: string, optional - A string of the application arguments to be added to job launch command line. + A string of the application arguments to be added to job + launch command line. stdout: string, optional A standard output filename. stage_inout: string, optional - A directory to copy files from. Default will take from current directory. + A directory to copy files from. Default will take from + current directory. hyperthreads: boolean, optional Whether to launch MPI tasks to hyperthreads test: boolean, optional - Whether this is a test - No job will be launched. Instead runline is printed to logger (At INFO level). + Whether this is a test - No job will be launched. Instead + runline is printed to logger (At INFO level). Returns @@ -388,9 +381,11 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, The lauched job object. - Note that if some combination of num_procs, num_nodes and ranks_per_node are provided, these will be honored if possible. If resource detection is on and these are omitted, then the available resources will be divided amongst workers. 
- - """ + Note that if some combination of num_procs, num_nodes and + ranks_per_node are provided, these will be honored if + possible. If resource detection is on and these are omitted, + then the available resources will be divided amongst workers. +""" # Find the default sim or gen app from registry.sim_default_app OR registry.gen_default_app # Could take optional app arg - if they want to supply here - instead of taking from registry @@ -609,7 +604,7 @@ def _time_out(process, timeout): def kill(self, job): - """ Kills or cancels the supplied job + """Kills or cancels the supplied job Parameters ----------- @@ -617,12 +612,12 @@ def kill(self, job): job: obj: Job The job object.to be polled. - - The signal used is determined by the job_controller attirbute will be send to the job, - followed by a wait for the process to terminate. If the attribute is True, then - a SIGKILL will be sent if the job has not finished after seconds. The kill can be - configured using the set_kill_mode function. - + The signal used is determined by the job_controller attribute + will be send to the job, followed by a wait for + the process to terminate. If the attribute is + True, then a SIGKILL will be sent if the job has not finished + after seconds. The kill can be configured using the + set_kill_mode function. """ jassert(isinstance(job, Job), "Invalid job has been provided") @@ -682,7 +677,7 @@ def kill(self, job): def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): - """ Configures the kill mode for the job_controller + """Configures the kill mode for the job_controller Parameters ---------- @@ -691,13 +686,12 @@ def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): The signal type to be sent to kill job: 'SIGTERM' or 'SIGKILL' wait_and_kill: boolean, optional - If True, a SIGKILL will be sent after seconds if the process has not terminated. + If True, a SIGKILL will be sent after seconds if + the process has not terminated. wait_time: int, optional - The number of seconds to wait for the job to finish before sending a SIGKILL when wait_and_kill is set. - (Default is 60). - - + The number of seconds to wait for the job to finish before + sending a SIGKILL when wait_and_kill is set. (Default is 60). """ if signal is not None: jassert(signal in SIGNALS, "Unknown signal {} supplied to set_kill_mode".format(signal)) @@ -731,13 +725,15 @@ def set_workerID(self, workerid): #Reformat create_machinefile to use this and also use this for non-machinefile cases when auto-detecting def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): - """ - Reconciles user supplied options with available Worker resources to produce run configuration. + """Reconciles user supplied options with available Worker + resources to produce run configuration. - Detects resources available to worker, checks if an existing user supplied config is valid, - and fills in any missing config information (ie. num_procs/num_nodes/ranks_per_node) + Detects resources available to worker, checks if an existing + user supplied config is valid, and fills in any missing config + information (ie. num_procs/num_nodes/ranks_per_node) - User supplied config options are honoured, and an exception is raised if these are infeasible. + User supplied config options are honoured, and an exception is + raised if these are infeasible. 
""" node_list = self.resources.local_nodelist @@ -798,7 +794,8 @@ def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyp def create_machinefile(self, machinefile=None, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): - """Create a machinefile based on user supplied config options, completed by detected machine resources""" + """Create a machinefile based on user supplied config options, + completed by detected machine resources""" #Maybe hyperthreads should be mpi_hyperthreads @@ -829,14 +826,14 @@ def create_machinefile(self, machinefile=None, num_procs=None, num_nodes=None, r #will prob want to adjust based on input #def get_hostlist(self, machinefile=None, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): def get_hostlist(self): - """Create a hostlist based on user supplied config options, completed by detected machine resources""" + """Create a hostlist based on user supplied config options, + completed by detected machine resources""" node_list = self.resources.local_nodelist hostlist_str = ",".join([str(x) for x in node_list]) return hostlist_str class BalsamJobController(JobController): - """Inherits from JobController and wraps the Balsam job management service .. note:: Job kills are currently not configurable in the Balsam job_controller. @@ -850,7 +847,8 @@ class BalsamJobController(JobController): def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new BalsamJobController instance. - A new BalsamJobController object is created with an application registry and configuration attributes + A new BalsamJobController object is created with an application + registry and configuration attributes """ #Will use super - atleast if use baseclass - but for now dont want to set self.mpi_launcher etc... @@ -888,7 +886,8 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stage_inout=None, test=False, hyperthreads=False): - """ Creates a new job, and either launches or schedules to launch in the job controller + """Creates a new job, and either launches or schedules to launch + in the job controller The created job object is returned. """ @@ -969,7 +968,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, def poll(self, job): - """ Polls and updates the status attributes of the supplied job """ + """Polls and updates the status attributes of the supplied job""" jassert(isinstance(job, BalsamJob), "Invalid job has been provided") # Check the jobs been launched (i.e. it has a process ID) @@ -1043,7 +1042,7 @@ def kill(self, job): #Check if can wait for kill to complete - affect signal used etc.... def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): - """ Not currently implemented for BalsamJobController. + """Not currently implemented for BalsamJobController. 
No action is taken """ From ecd3af073609f72a4dfe2b400f39b4901f9671bd Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:23:35 -0500 Subject: [PATCH 021/101] Clean app setup at start of launch (Controller and BalsamController) --- libensemble/controller.py | 30 ++++++++---------------------- libensemble/register.py | 1 - 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index f78f36d00..84f73be59 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -387,18 +387,10 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, then the available resources will be divided amongst workers. """ - # Find the default sim or gen app from registry.sim_default_app OR registry.gen_default_app - # Could take optional app arg - if they want to supply here - instead of taking from registry - if calc_type == 'sim': - jassert(self.registry.sim_default_app is not None, - "Default sim app is not set") - app = self.registry.sim_default_app - elif calc_type == 'gen': - jassert(self.registry.gen_default_app is not None, - "Default gen app is not set") - app = self.registry.gen_default_app - else: - raise JobControllerException("Unrecognized calculation type", calc_type) + app = self.registry.default_app(calc_type) + jassert(calc_type in ['sim', 'gen'], + "Unrecognized calculation type", calc_type) + jassert(app, "Default {} app is not set".format(calc_type)) #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# @@ -893,16 +885,10 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, """ import balsam.launcher.dag as dag - # Find the default sim or gen app from registry.sim_default_app OR registry.gen_default_app - # Could take optional app arg - if they want to supply here - instead of taking from registry - if calc_type == 'sim': - jassert(self.registry.sim_default_app, "Default sim app is not set") - app = self.registry.sim_default_app - elif calc_type == 'gen': - jassert(self.registry.gen_default_app, "Default gen app is not set") - app = self.registry.gen_default_app - else: - raise JobControllerException("Unrecognized calculation type", calc_type) + app = self.registry.default_app(calc_type) + jassert(calc_type in ['sim', 'gen'], + "Unrecognized calculation type", calc_type) + jassert(app, "Default {} app is not set".format(calc_type)) #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# diff --git a/libensemble/register.py b/libensemble/register.py index bd7a23700..2dbc12145 100644 --- a/libensemble/register.py +++ b/libensemble/register.py @@ -52,7 +52,6 @@ def gen_default_app(self): """Return the default generator app.""" return self._default_apps['gen'] - @property def default_app(self, calc_type): """Return the default calc_type app.""" return self._default_apps.get(calc_type) From 88e84f1ac7f2e1033f2d4270b8a1c063e4133449 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:24:50 -0500 Subject: [PATCH 022/101] Kill commented-out calc_timing. 
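The register.py hunk above drops the @property decorator from default_app because a property getter cannot accept an argument, and the new launch() code calls registry.default_app(calc_type) directly. A minimal standalone sketch of the resulting accessor (simplified; the real Register class defines more than this):

    class Registry:
        def __init__(self):
            self._default_apps = {'sim': None, 'gen': None}

        def default_app(self, calc_type):
            """Return the default app for 'sim' or 'gen' (None if not registered)."""
            return self._default_apps.get(calc_type)

With this accessor, the launch() guard reduces to two jassert calls: one rejecting an unrecognized calc_type and one rejecting a type whose default app was never registered.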
--- libensemble/controller.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 84f73be59..401e68065 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -865,16 +865,6 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, JobController.controller = self #BalsamJobController.controller = self - #def _calc_job_timing(job): - ##Get runtime from Balsam - #if job.launch_time is None: - #logger.warning("Cannot calc job total_time - launch time not set") - #return - - #if job.total_time is None: - #job.total_time = time.time() - job.launch_time - - def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stage_inout=None, test=False, hyperthreads=False): From 4e6a6cbf4972e56df010d69eadbc301842911a01 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:39:44 -0500 Subject: [PATCH 023/101] Misc cleanup in controller. --- libensemble/controller.py | 117 +++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 401e68065..a99e981e5 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -56,8 +56,9 @@ class Job: newid = itertools.count() - def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, ranks_per_node=None, - machinefile=None, hostlist=None, workdir=None, stdout=None, workerid=None): + def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, hostlist=None, + workdir=None, stdout=None, workerid=None): """Instantiate a new Job instance. A new job object is created with an id, status and configuration attributes @@ -147,7 +148,9 @@ class BalsamJob(Job): #newid = itertools.count() #hopefully can use the one in Job - def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, hostlist=None, workdir=None, stdout=None, workerid=None): + def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, hostlist=None, + workdir=None, stdout=None, workerid=None): """Instantiate a new BalsamJob instance. A new BalsamJob object is created with an id, status and @@ -155,11 +158,13 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, rank job_controller on a launch. """ - super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, workdir, stdout, workerid) + super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, + machinefile, hostlist, workdir, stdout, workerid) self.balsam_state = None - #prob want to override workdir attribute with Balsam value - though does it exist yet? + #prob want to override workdir attribute with Balsam value - + #though does it exist yet? #self.workdir = None #Don't know until starts running self.workdir = workdir #Default for libe now is to run in place. 
@@ -185,8 +190,7 @@ def calc_job_timing(self): class JobController: - - """ The job_controller can create, poll and kill runnable jobs + """The job_controller can create, poll and kill runnable jobs **Class Attributes:** @@ -226,7 +230,8 @@ def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): return num_procs, num_nodes, ranks_per_node - def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): + def __init__(self, registry=None, auto_resources=True, + nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new JobController instance. A new JobController object is created with an application @@ -331,8 +336,9 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, #else: #setattr(job, k, v) - def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, - machinefile=None, app_args=None, stdout=None, stage_inout=None, hyperthreads=False, test=False): + def launch(self, calc_type, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, app_args=None, + stdout=None, stage_inout=None, hyperthreads=False, test=False): """Creates a new job, and either launches or schedules launch. The created job object is returned. @@ -406,19 +412,22 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, hostlist = self.get_hostlist() else: #machinefile + machinefile = "machinefile_autogen" if self.workerID is not None: - machinefile = 'machinefile_autogen_for_worker_' + str(self.workerID) - else: - machinefile = 'machinefile_autogen' - mfile_created, num_procs, num_nodes, ranks_per_node = self.create_machinefile(machinefile, num_procs, num_nodes, ranks_per_node, hyperthreads) + machinefile += "_for_worker_{}".format(self.workerID) + mfile_created, num_procs, num_nodes, ranks_per_node = \ + self.create_machinefile(machinefile, num_procs, num_nodes, + ranks_per_node, hyperthreads) jassert(mfile_created, "Auto-creation of machinefile failed") else: - num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node, machinefile) - + num_procs, num_nodes, ranks_per_node = \ + JobController.job_partition(num_procs, num_nodes, + ranks_per_node, machinefile) - default_workdir = os.getcwd() #Will be possible to override with arg when implemented - job = Job(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, default_workdir, stdout, self.workerID) + default_workdir = os.getcwd() # May override with arg when implemented + job = Job(app, app_args, num_procs, num_nodes, ranks_per_node, + machinefile, hostlist, default_workdir, stdout, self.workerID) #Temporary perhaps - though when create workdirs - will probably keep output in place if stage_inout is not None: @@ -715,8 +724,10 @@ def set_workerID(self, workerid): self.workerID = workerid - #Reformat create_machinefile to use this and also use this for non-machinefile cases when auto-detecting - def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): + #Reformat create_machinefile to use this and also use this for + #non-machinefile cases when auto-detecting + def get_resources(self, num_procs=None, num_nodes=None, + ranks_per_node=None, hyperthreads=False): """Reconciles user supplied options with available Worker resources to produce run configuration. 
@@ -747,36 +758,49 @@ def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyp jassert(node_list, "Node list is empty - aborting") #If no decomposition supplied - use all available cores/nodes - if num_procs is None and num_nodes is None and ranks_per_node is None: + if not num_procs and not num_nodes and not ranks_per_node: num_nodes = local_node_count ranks_per_node = cores_avail_per_node_per_worker - #logger - logger.debug("No decomposition supplied - using all available resource. Nodes: {} ranks_per_node {}".format(num_nodes, ranks_per_node)) - elif num_nodes is None and ranks_per_node is None: - #Got just num_procs + logger.debug("No decomposition supplied - " + "using all available resource. " + "Nodes: {} ranks_per_node {}". + format(num_nodes, ranks_per_node)) + elif not num_nodes and not ranks_per_node: num_nodes = local_node_count - #Here is where really want a compact/scatter option - go for scatter (could get cores and say if less than one node - but then hyperthreads complication if no psutil installed) - elif num_procs is None and ranks_per_node is None: + #Here is where really want a compact/scatter option - go for + #scatter (could get cores and say if less than one node - but then + #hyperthreads complication if no psutil installed) + elif not num_procs and not ranks_per_node: #Who would just put num_nodes??? ranks_per_node = cores_avail_per_node_per_worker - elif num_procs is None and num_nodes is None: + elif not num_procs and not num_nodes: num_nodes = local_node_count - #checks config is consistent and sufficient to express - does not check actual resources - num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) + #checks config is consistent and sufficient to express - + #does not check actual resources + num_procs, num_nodes, ranks_per_node = \ + JobController.job_partition(num_procs, num_nodes, ranks_per_node) #Could just downgrade to those available with warning - for now error jassert(num_nodes <= local_node_count, - "Not enough nodes to honour arguments. Requested {}. Only {} available".format(num_nodes, local_node_count)) + "Not enough nodes to honour arguments. " + "Requested {}. Only {} available". + format(num_nodes, local_node_count)) jassert(ranks_per_node <= cores_avail_per_node, - "Not enough processors on a node to honour arguments. Requested {}. Only {} available".format(ranks_per_node, cores_avail_per_node)) + "Not enough processors on a node to honour arguments. " + "Requested {}. Only {} available". + format(ranks_per_node, cores_avail_per_node)) jassert(ranks_per_node <= cores_avail_per_node_per_worker, - "Not enough processors per worker to honour arguments. Requested {}. Only {} available".format(ranks_per_node, cores_avail_per_node_per_worker)) + "Not enough processors per worker to honour arguments. " + "Requested {}. Only {} available". + format(ranks_per_node, cores_avail_per_node_per_worker)) jassert(num_procs <= (cores_avail_per_node * local_node_count), - "Not enough procs to honour arguments. Requested {}. Only {} available".format(num_procs, cores_avail_per_node*local_node_count)) + "Not enough procs to honour arguments. " + "Requested {}. Only {} available". + format(num_procs, cores_avail_per_node*local_node_count)) if num_nodes < local_node_count: logger.warning("User constraints mean fewer nodes being used than available. {} nodes used. 
{} nodes available".format(num_nodes, local_node_count)) @@ -785,34 +809,33 @@ def get_resources(self, num_procs=None, num_nodes=None, ranks_per_node=None, hyp - def create_machinefile(self, machinefile=None, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): + def create_machinefile(self, machinefile=None, num_procs=None, + num_nodes=None, ranks_per_node=None, + hyperthreads=False): """Create a machinefile based on user supplied config options, completed by detected machine resources""" #Maybe hyperthreads should be mpi_hyperthreads - if machinefile is None: - machinefile = 'machinefile' - + machinefile = machinefile or 'machinefile' if os.path.isfile(machinefile): try: os.remove(machinefile) except: pass - #num_procs, num_nodes, ranks_per_node = self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, hyperthreads=hyperthreads) node_list = self.resources.local_nodelist - logger.debug("Creating machinefile with {} nodes and {} ranks per node".format(num_nodes, ranks_per_node)) + logger.debug("Creating machinefile with {} nodes and {} ranks per node". + format(num_nodes, ranks_per_node)) with open(machinefile, 'w') as f: for node in node_list[:num_nodes]: f.write((node + '\n') * ranks_per_node) #Return true if created and not empty - built_mfile = os.path.isfile(machinefile) and os.path.getsize(machinefile) > 0 - - #Return new values for num_procs,num_nodes,ranks_per_node - in case want to use + built_mfile = (os.path.isfile(machinefile) + and os.path.getsize(machinefile) > 0) return built_mfile, num_procs, num_nodes, ranks_per_node #will prob want to adjust based on input @@ -836,7 +859,8 @@ class BalsamJobController(JobController): #controller = None - def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): + def __init__(self, registry=None, auto_resources=True, + nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new BalsamJobController instance. A new BalsamJobController object is created with an application @@ -866,8 +890,9 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, #BalsamJobController.controller = self - def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, - machinefile=None, app_args=None, stdout=None, stage_inout=None, test=False, hyperthreads=False): + def launch(self, calc_type, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, app_args=None, + stdout=None, stage_inout=None, test=False, hyperthreads=False): """Creates a new job, and either launches or schedules to launch in the job controller From 8676b96f18fecb928f7a12674cbbcbe60db87f18 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:50:25 -0500 Subject: [PATCH 024/101] Switched MPI command construction to use launcher.form_command. 
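The run line is now built by passing a per-MPI-variant template list and the job's attribute dictionary to launcher.form_command (see the diff below). The helper itself is not included in this patch; a hedged sketch of the behaviour the call site appears to rely on - fill each {field} placeholder from the dictionary and drop any option whose field is unset - might look like:

    import string

    def form_command(cmd_template, specs):
        """Fill {field} placeholders from specs, skipping template pieces
        whose fields are missing or None (illustrative sketch only)."""
        fmt = string.Formatter()
        args = []
        for piece in cmd_template:
            fields = [f for _, f, _, _ in fmt.parse(piece) if f]
            if all(specs.get(f) is not None for f in fields):
                args.extend(piece.format(**specs).split())
        return args

    # For example, with no machinefile or hostlist set:
    #   form_command(['mpirun', '-machinefile {machinefile}',
    #                 '-np {num_procs}', '--ppn {ranks_per_node}'],
    #                {'machinefile': None, 'num_procs': 4, 'ranks_per_node': 2})
    #   -> ['mpirun', '-np', '4', '--ppn', '2']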
--- libensemble/controller.py | 69 ++++++++++----------------------------- 1 file changed, 17 insertions(+), 52 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index a99e981e5..7e3456b51 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -17,6 +17,8 @@ import signal import itertools import time + +import libensemble.launcher as launcher from libensemble.register import Register from libensemble.resources import Resources @@ -275,28 +277,21 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=nodelist_env_slurm, nodelist_env_cobalt=nodelist_env_cobalt) - #logger.debug("top_level_dir is {}".format(self.top_level_dir)) - #todo Configure by autodetection - #In fact it will be a sub-object - most likely with inhertience - based on detection or specification - #Also the construction of the run-line itself will prob. be a function of that object - #For now though - do like this: - - mpi_variant = Resources.get_MPI_variant() - if mpi_variant == 'mpich': - self.mpi_launcher = 'mpirun' - self.mfile = '-machinefile' - self.nprocs = '-np' - self.nnodes = '' - self.ppn = '--ppn' - self.hostlist = '-hosts' - elif mpi_variant == 'openmpi': - self.mpi_launcher = 'mpirun' - self.mfile = '-machinefile' - self.nprocs = '-np' - self.nnodes = '' - self.ppn = '-npernode' - self.hostlist = '-host' + #In fact it will be a sub-object - most likely with inheritance - based + #on detection or specification. Also the construction of the run-line + #itself will prob. be a function of that object. For now though - do + #like this: + mpi_commands = { + 'mpich': ['mpirun', '--env {env}', '-machinefile {machinefile}', + '-hosts {hostlist}', '-np {num_procs}', + '--ppn {ranks_per_node}'], + 'openmpi': ['mpirun', '-x {env}', '-machinefile {machinefile}', + '-host {hostlist}', '-np {num_procs}', + '-npernode {ranks_per_node}'], + } + self.mpi_command = mpi_commands[Resources.get_MPI_variant()] + #self.mpi_launcher = 'srun' #self.mfile = '-m arbitrary' #self.nprocs = '--ntasks' @@ -433,38 +428,8 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, if stage_inout is not None: logger.warning('stage_inout option ignored in this job_controller - runs in-place') - #Construct run line - possibly subroutine - runline = [self.mpi_launcher] - - if job.machinefile is not None: - #os.environ['SLURM_HOSTFILE'] = job.machinefile - runline.append(self.mfile) - runline.append(job.machinefile) - - #Should be else - if machine file - dont need any other config - - if job.hostlist is not None: - #os.environ['SLURM_HOSTFILE'] = job.machinefile - runline.append(self.hostlist) - runline.append(job.hostlist) - - if job.num_procs is not None: - runline.append(self.nprocs) - runline.append(str(job.num_procs)) - - #Not currently setting nodes - #- as not always supported - but should always have the other two after calling _job_partition - #if job.num_nodes is not None: - #runline.append(self.nnodes) - #runline.append(str(job.num_nodes)) - - #Currently issues - command depends on mpich/openmpi etc... - if job.ranks_per_node is not None: - runline.append(self.ppn) - runline.append(str(job.ranks_per_node)) - + runline = launcher.form_command(self.mpi_command, vars(job)) runline.append(job.app.full_path) - if job.app_args is not None: runline.extend(job.app_args.split()) From d6d8dc40ddbbbe05ffd0326eb9ff55e8f5ee84e8 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 17:54:03 -0500 Subject: [PATCH 025/101] Cleanup dead code. 
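The commented-out block removed from kill() in the hunk below referred to the Python 3 wait-with-timeout idiom for escalating a kill. For reference, that idiom (standard subprocess usage, not code from this project) is:

    import subprocess

    def terminate_then_kill(process, wait_time=60):
        """Send SIGTERM; escalate to SIGKILL if the process outlives wait_time."""
        process.terminate()                    # SIGTERM
        try:
            process.wait(timeout=wait_time)    # raises TimeoutExpired in Python 3
        except subprocess.TimeoutExpired:
            process.kill()                     # SIGKILL
            process.wait()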
--- libensemble/controller.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 7e3456b51..bae2b5668 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -616,20 +616,6 @@ def kill(self, job): JobController._kill_process(job.process, signal.SIGKILL) job.process.wait() - #Using subprocess timeout attribute where available (py3) - #try: - #job.process.wait(timeout=self.wait_time) - ##stdout,stderr = self.process.communicate(timeout=self.wait_time) #Wait for process to finish - #except TypeError: #eg. Python2 - ##logger.warning("TimeoutExpired not supported in this version of Python. Issuing SIGKILL to job {}".format(job.name)) - #if JobController._time_out(job.process, self.wait_time): - #logger.warning("Kill signal {} timed out for job {}: Issuing SIGKILL".format(self.kill_signal, job.name)) - #JobController._kill_process(job.process, signal.SIGKILL) - #job.process.wait() - #except subprocess.TimeoutExpired: - #logger.warning("Kill signal {} timed out for job {}: Issuing SIGKILL".format(self.kill_signal, job.name)) - #JobController._kill_process(job.process, signal.SIGKILL) - #job.process.wait() else: job.process.wait() From eafc2a10d233bc741dc2afb74d197e97383a0200 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 18:27:04 -0500 Subject: [PATCH 026/101] Minor formatting cleanup. --- libensemble/controller.py | 55 ++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index bae2b5668..9af510ca4 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -12,7 +12,6 @@ """ import os -import subprocess import logging import signal import itertools @@ -92,7 +91,7 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, jassert(app is not None, "Job must be created with an app - no app found for job {}". - format(self.id)) + format(self.id)) worker_name = "_worker{}".format(self.workerID) if self.workerID else "" self.name = "job_{}{}_{}".format(app.name, worker_name, self.id) @@ -105,13 +104,15 @@ def workdir_exists(self): def file_exists_in_workdir(self, filename): """Returns True if the named file exists in the job's workdir""" - return self.workdir and os.path.exists(os.path.join(self.workdir, filename)) + return (self.workdir + and os.path.exists(os.path.join(self.workdir, filename))) def read_file_in_workdir(self, filename): """Open and reads the named file in the job's workdir """ path = os.path.join(self.workdir, filename) if not os.path.exists(path): - raise ValueError("{} not found in working directory".format(filename)) + raise ValueError("{} not found in working directory". 
+ format(filename)) with open(path) as f: return f.read() @@ -217,7 +218,7 @@ def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): if not num_procs: jassert(num_nodes and ranks_per_node, - "Must set either num_procs or num_nodes/ranks_per_node or machinefile") + "Need num_procs, num_nodes/ranks_per_node, or machinefile") num_procs = num_nodes * ranks_per_node elif not num_nodes: @@ -398,9 +399,13 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, hostlist = None if machinefile is None and self.auto_resources: - #klugging this for now - not nec machinefile if more than one node - try a hostlist + #kludging this for now - not nec machinefile if more than one node + #- try a hostlist - num_procs, num_nodes, ranks_per_node = self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, hyperthreads=hyperthreads) + num_procs, num_nodes, ranks_per_node = \ + self.get_resources(num_procs=num_procs, num_nodes=num_nodes, + ranks_per_node=ranks_per_node, + hyperthreads=hyperthreads) if num_nodes > 1: #hostlist @@ -424,7 +429,8 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, job = Job(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, default_workdir, stdout, self.workerID) - #Temporary perhaps - though when create workdirs - will probably keep output in place + #Temporary perhaps - though when create workdirs - will probably keep + #output in place if stage_inout is not None: logger.warning('stage_inout option ignored in this job_controller - runs in-place') @@ -436,28 +442,20 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, if test: logger.info('Test selected: Not launching job') logger.info('runline args are {}'.format(runline)) - #print('runline args are', runline) - #print('stdout to', stdout) - #logger.info(runline) else: - logger.debug("Launching job {}: {}".format(job.name, " ".join(runline))) #One line - #logger.debug("Launching job {}:\n{}{}".format(job.name, " "*32, " ".join(runline))) #With newline + logger.debug("Launching job {}: {}". + format(job.name, " ".join(runline))) #One line - #not good for timing job itself as dont know when finishes - if use this prob. change to date time or - #use for timeout. For now using for timing with approx end.... + #not good for timing job itself as dont know when finishes - if use + #this prob. change to date time or use for timeout. For now using + #for timing with approx end.... job.launch_time = time.time() - #job.process = subprocess.Popen(runline, cwd='./', stdout = open(job.stdout,'w'), shell=False) - - job.process = subprocess.Popen(runline, cwd='./', stdout=open(job.stdout, 'w'), shell=False, preexec_fn=os.setsid) - - - #To test when have workdir - #job.process = subprocess.Popen(runline, cwd=job.workdir, stdout = open(job.stdout,'w'), shell=False) - + job.process = launcher.launch(runline, cwd='./', + stdout=open(job.stdout, 'w'), + start_new_session=True) self.list_of_jobs.append(job) - #return job.id return job @@ -478,11 +476,13 @@ def poll(self, job): #Prob should be recoverable and return state - but currently fatal jassert(job.process is not None, "Polled job {} has no process ID - check jobs been launched". - format(job.name)) + format(job.name)) # Do not poll if job already finished # Maybe should re-poll job to check (in case self.finished set in error!)??? if job.finished: - logger.warning('Polled job {} has already finished. Not re-polling. 
Status is {}'.format(job.name, job.state)) + logger.warning("Polled job {} has already finished. " + "Not re-polling. Status is {}". + format(job.name, job.state)) return #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# @@ -715,7 +715,7 @@ def get_resources(self, num_procs=None, num_nodes=None, logger.debug("No decomposition supplied - " "using all available resource. " "Nodes: {} ranks_per_node {}". - format(num_nodes, ranks_per_node)) + format(num_nodes, ranks_per_node)) elif not num_nodes and not ranks_per_node: num_nodes = local_node_count #Here is where really want a compact/scatter option - go for @@ -841,6 +841,7 @@ def __init__(self, registry=None, auto_resources=True, #BalsamJobController.controller = self + # DSB: swaps test and hyperthreads vs base JobController def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stage_inout=None, test=False, hyperthreads=False): From 7c33954ed061bd933d40c0a16b1281dd27b50c77 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 18:47:57 -0500 Subject: [PATCH 027/101] Switch explicit signal logic for launcher.cancel call. --- libensemble/controller.py | 47 +++++++-------------------------------- libensemble/launcher.py | 2 ++ 2 files changed, 10 insertions(+), 39 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 9af510ca4..a6e029146 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -13,7 +13,6 @@ import os import logging -import signal import itertools import time @@ -548,27 +547,6 @@ def manager_poll(self, job): logger.warning("Received unrecognized manager signal {} - ignoring".format(man_signal)) - @staticmethod - def _kill_process(process, signal): - """Launch the process kill for this system""" - time.sleep(0.1) # Without a small wait - kill signal can not work - os.killpg(os.getpgid(process.pid), signal) # Kill using process group (see launch with preexec_fn=os.setsid) - - #process.send_signal(signal) # Kill by sending direct signal - - # Just for you, python2 - @staticmethod - def _time_out(process, timeout): - """Loop to wait for process to finish after a kill""" - start_wait_time = time.time() - while time.time() - start_wait_time < timeout: - time.sleep(0.01) - poll = process.poll() - if poll is not None: - return False # process has finished - no timeout - return True # process has not finished - timeout - - def kill(self, job): """Kills or cancels the supplied job @@ -599,25 +577,16 @@ def kill(self, job): logger.debug("Killing job {}".format(job.name)) - # Issue signal - sig = {'SIGTERM': signal.SIGTERM, 'SIGKILL': signal.SIGKILL} - jassert(self.kill_signal in sig, "Unknown kill signal") - try: - JobController._kill_process(job.process, sig[self.kill_signal]) - except OSError: # In Python 3, ProcessLookupError - logger.warning("Tried to kill job {}. Process {} not found. May have finished".format(job.name, job.process.pid)) + jassert(self.kill_signal in ['SIGTERM', 'SIGKILL'], + "Unknown kill signal") - # Wait for job to be killed - if self.wait_and_kill: + timeout = 0 # Default is to just kill and wait + if self.kill_signal == 'SIGTERM': # For a graceful kill + timeout = None # Terminate and just wait + if self.wait_and_kill: # Or if we want to wait and kill... 
+ timeout = self.wait_time # Set a timeout - # My python2 method works ok for py2 and py3 - if JobController._time_out(job.process, self.wait_time): - logger.warning("Kill signal {} timed out for job {}: Issuing SIGKILL".format(self.kill_signal, job.name)) - JobController._kill_process(job.process, signal.SIGKILL) - job.process.wait() - - else: - job.process.wait() + launcher.cancel(job.process, timeout) job.state = 'USER_KILLED' job.finished = True diff --git a/libensemble/launcher.py b/libensemble/launcher.py index 8ad329e79..a925bd904 100644 --- a/libensemble/launcher.py +++ b/libensemble/launcher.py @@ -68,6 +68,8 @@ def terminatepg(process): pid = process.pid pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 if pgid == pid: + # Claim in original controller was that we need a wait -- + # have not seen this anywhere else in my searching... os.killpg(pgid, signal.SIGTERM) elif hasattr(signal, 'CTRL_BREAK_EVENT'): # Supposedly does a group terminate for Windows... From c023255f65b9576662a11c8057ef104eda924987 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 19:12:43 -0500 Subject: [PATCH 028/101] Clean up JobController.poll (+ clean up some formatting). --- libensemble/controller.py | 100 ++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index a6e029146..ba0773f0b 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -61,8 +61,9 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, workdir=None, stdout=None, workerid=None): """Instantiate a new Job instance. - A new job object is created with an id, status and configuration attributes - This will normally be created by the job_controller on a launch + A new job object is created with an id, status and configuration + attributes. This will normally be created by the job_controller + on a launch """ self.id = next(Job.newid) @@ -70,7 +71,7 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, self.state = 'CREATED' #: test1 docstring self.process = None self.errcode = None - self.finished = False # True means job has run - not whether was successful + self.finished = False # True means job ran, not that it succeeded self.success = False self.launch_time = None self.runtime = None @@ -123,9 +124,9 @@ def read_stdout(self): """Open and reads the job's stdout file in the job's workdir""" return self.read_file_in_workdir(self.stdout) - #Note - this is currently only final job-time. May make running job time. 
- #And prob want to use for polling in sim func - esp in balsam - where want acutal runtime not time since launch + #And prob want to use for polling in sim func - esp in balsam - + #where want acutal runtime not time since launch def calc_job_timing(self): """Calculate timing information for this job""" if self.launch_time is None: @@ -148,8 +149,6 @@ class BalsamJob(Job): """ - #newid = itertools.count() #hopefully can use the one in Job - def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, hostlist=None, workdir=None, stdout=None, workerid=None): @@ -207,12 +206,14 @@ class JobController: @staticmethod def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): - """ Takes provided nprocs/nodes/ranks and outputs working configuration of procs/nodes/ranks or error """ + """Takes provided nprocs/nodes/ranks and outputs working + configuration of procs/nodes/ranks or error""" #If machinefile is provided - ignore everything else if machinefile: if num_procs or num_nodes or ranks_per_node: - logger.warning('Machinefile provided - overriding procs/nodes/ranks_per_node') + logger.warning("Machinefile provided - overriding " + "procs/nodes/ranks_per_node") return None, None, None if not num_procs: @@ -292,20 +293,14 @@ def __init__(self, registry=None, auto_resources=True, } self.mpi_command = mpi_commands[Resources.get_MPI_variant()] - #self.mpi_launcher = 'srun' - #self.mfile = '-m arbitrary' - #self.nprocs = '--ntasks' - #self.nnodes = '--nodes' - #self.ppn = '--ntasks-per-node' - #self.hostlist = '-w' - #Job controller settings - can be set in user function. self.kill_signal = 'SIGTERM' - self.wait_and_kill = True #If true - wait for wait_time after signal and then kill with SIGKILL + self.wait_and_kill = True self.wait_time = 60 #list_of_jobs: Need to decide on reset... - reset for each calc? - #and how link to libe job (or calc) class - if reset for each calc - could store this in job + #and how link to libe job (or calc) class - if reset for each calc - + #could store this in job self.list_of_jobs = [] self.workerID = None @@ -315,7 +310,8 @@ def __init__(self, registry=None, auto_resources=True, #self.resources = Resources(top_level_dir = self.top_level_dir) - #If this could share multiple launches could set default job parameters here (nodes/ranks etc...) + #If this could share multiple launches could set default job parameters + #here (nodes/ranks etc...) # May change job_controller launch functions to use **kwargs and then init job empty - and use setattr @@ -386,7 +382,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node are provided, these will be honored if possible. If resource detection is on and these are omitted, then the available resources will be divided amongst workers. -""" + """ app = self.registry.default_app(calc_type) jassert(calc_type in ['sim', 'gen'], @@ -431,7 +427,8 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, #Temporary perhaps - though when create workdirs - will probably keep #output in place if stage_inout is not None: - logger.warning('stage_inout option ignored in this job_controller - runs in-place') + logger.warning("stage_inout option ignored in this " + "job_controller - runs in-place") runline = launcher.form_command(self.mpi_command, vars(job)) runline.append(job.app.full_path) @@ -477,7 +474,7 @@ def poll(self, job): "Polled job {} has no process ID - check jobs been launched". 
format(job.name)) # Do not poll if job already finished - # Maybe should re-poll job to check (in case self.finished set in error!)??? + # Maybe should re-poll to check (in case self.finished set in error!)??? if job.finished: logger.warning("Polled job {} has already finished. " "Not re-polling. Status is {}". @@ -490,29 +487,18 @@ def poll(self, job): poll = job.process.poll() if poll is None: job.state = 'RUNNING' - else: - job.finished = True - #logger.debug("Process {} Completed".format(job.process)) + return - job.calc_job_timing() + job.finished = True + job.calc_job_timing() + + # Want to be more fine-grained about non-success (fail vs user kill?) + job.errcode = job.process.returncode + job.success = (job.errcode == 0) + job.state = 'FINISHED' if job.success else 'FAILED' + logger.debug("Job {} completed with errcode {} ({})". + format(job.name, job.errcode, job.state)) - if job.process.returncode == 0: - job.success = True - job.errcode = 0 - #logger.debug("Process {} completed successfully".format(job.process)) - logger.debug("Job {} completed successfully".format(job.name)) - job.state = 'FINISHED' - else: - #Need to differentiate failure from if job was user-killed !!!! What if remotely??? - #If this process killed the job it will already be set and if not re-polling will not get here. - #But could query existing state here as backup?? - Also may add a REMOTE_KILL state??? - #Not yet remote killing so assume failed.... - job.errcode = job.process.returncode - logger.debug("Job {} failed".format(job.name)) - job.state = 'FAILED' - - #Just updates job as provided - #return job def manager_poll(self, job): """ Polls for a manager signal @@ -529,7 +515,8 @@ def manager_poll(self, job): """ #Will use MPI_MODE from settings.py but for now assume MPI - from libensemble.message_numbers import STOP_TAG, MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL + from libensemble.message_numbers import \ + STOP_TAG, MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL from mpi4py import MPI # Manager Signals @@ -544,7 +531,8 @@ def manager_poll(self, job): elif man_signal == MAN_SIGNAL_KILL: job.manager_signal = 'kill' else: - logger.warning("Received unrecognized manager signal {} - ignoring".format(man_signal)) + logger.warning("Received unrecognized manager signal {} - " + "ignoring".format(man_signal)) def kill(self, job): @@ -567,13 +555,15 @@ def kill(self, job): jassert(isinstance(job, Job), "Invalid job has been provided") if job.finished: - logger.warning('Trying to kill job that is no longer running. Job {}: Status is {}'.format(job.name, job.state)) + logger.warning("Trying to kill job that is no longer running. " + "Job {}: Status is {}".format(job.name, job.state)) return if job.process is None: time.sleep(0.2) jassert(job.process is not None, - "Attempting to kill job {} that has no process ID - check jobs been launched".format(job.name)) + "Attempting to kill job {} that has no process ID - " + "check jobs been launched".format(job.name)) logger.debug("Killing job {}".format(job.name)) @@ -615,7 +605,9 @@ def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): sending a SIGKILL when wait_and_kill is set. (Default is 60). """ if signal is not None: - jassert(signal in SIGNALS, "Unknown signal {} supplied to set_kill_mode".format(signal)) + jassert(signal in SIGNALS, + "Unknown signal {} supplied to set_kill_mode". 
+ format(signal)) self.kill_signal = signal if wait_and_kill is not None: @@ -624,7 +616,7 @@ def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): if wait_time is not None: self.wait_time = wait_time if not wait_and_kill: - logger.warning('wait_time set but will have no effect while wait_and_kill is False') + logger.warning('wait_time does nothing if wait_and_kill=False') def get_job(self, jobid): @@ -635,7 +627,8 @@ def get_job(self, jobid): return job logger.warning("Job {} not found in joblist".format(jobid)) return None - logger.warning("Job {} not found in joblist. Joblist is empty".format(jobid)) + logger.warning("Job {} not found in joblist. Joblist is empty". + format(jobid)) return None @@ -723,7 +716,9 @@ def get_resources(self, num_procs=None, num_nodes=None, format(num_procs, cores_avail_per_node*local_node_count)) if num_nodes < local_node_count: - logger.warning("User constraints mean fewer nodes being used than available. {} nodes used. {} nodes available".format(num_nodes, local_node_count)) + logger.warning("User constraints mean fewer nodes being used " + "than available. {} nodes used. {} nodes available". + format(num_nodes, local_node_count)) return num_procs, num_nodes, ranks_per_node @@ -899,7 +894,8 @@ def poll(self, job): # Do not poll if job already finished if job.finished: - logger.warning('Polled job has already finished. Not re-polling. Status is {}'.format(job.state)) + logger.warning("Polled job has already finished. Not re-polling. " + "Status is {}".format(job.state)) return #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# From 82590d0733f644fe520ac5efa4a16928a875aa7a Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 20:39:47 -0500 Subject: [PATCH 029/101] Add back sleep to prevent kill before launch (this is a bit bogus). --- libensemble/launcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libensemble/launcher.py b/libensemble/launcher.py index a925bd904..b9204667d 100644 --- a/libensemble/launcher.py +++ b/libensemble/launcher.py @@ -70,6 +70,7 @@ def terminatepg(process): if pgid == pid: # Claim in original controller was that we need a wait -- # have not seen this anywhere else in my searching... + time.sleep(0.1) os.killpg(pgid, signal.SIGTERM) elif hasattr(signal, 'CTRL_BREAK_EVENT'): # Supposedly does a group terminate for Windows... From 6ee8435bc3c5f8876adf96199c9323c6f73cef6b Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 20:46:34 -0500 Subject: [PATCH 030/101] Try check poll before giving up on kill. --- libensemble/launcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libensemble/launcher.py b/libensemble/launcher.py index b9204667d..8ee8454a5 100644 --- a/libensemble/launcher.py +++ b/libensemble/launcher.py @@ -51,6 +51,8 @@ def launch(cmd_template, specs=None, **kwargs): def killpg(process): "Kill the process (and group if it is group leader)." try: + if process.poll() is not None: + return False pid = process.pid pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 if pgid == pid: @@ -65,12 +67,14 @@ def killpg(process): def terminatepg(process): "Send termination signal to the process (and group if it is group leader)" try: + if process.poll() is not None: + return False pid = process.pid pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 if pgid == pid: # Claim in original controller was that we need a wait -- # have not seen this anywhere else in my searching... 
- time.sleep(0.1) + #time.sleep(0.1) os.killpg(pgid, signal.SIGTERM) elif hasattr(signal, 'CTRL_BREAK_EVENT'): # Supposedly does a group terminate for Windows... From f648f350c25668d2df8dfc0eaa1e964fe63a4eac Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 21:33:33 -0500 Subject: [PATCH 031/101] Remove extraneous poll added at start of killpg / terminatepg. --- libensemble/launcher.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libensemble/launcher.py b/libensemble/launcher.py index 8ee8454a5..a02c592a7 100644 --- a/libensemble/launcher.py +++ b/libensemble/launcher.py @@ -51,8 +51,6 @@ def launch(cmd_template, specs=None, **kwargs): def killpg(process): "Kill the process (and group if it is group leader)." try: - if process.poll() is not None: - return False pid = process.pid pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 if pgid == pid: @@ -67,8 +65,6 @@ def killpg(process): def terminatepg(process): "Send termination signal to the process (and group if it is group leader)" try: - if process.poll() is not None: - return False pid = process.pid pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 if pgid == pid: From b121fccca06def059acfff741a3a946d6f4bc949 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 21:37:50 -0500 Subject: [PATCH 032/101] Uncomment command to shorten timeout for kill in jobcontroller test_doublekill. --- libensemble/tests/unit_tests/test_jobcontroller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index 9b67ca2a7..010d370c8 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -331,7 +331,7 @@ def test_doublekill(): args_for_sim = 'sleep 2.0' job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) jobctl.poll(job) - #jobctl.set_kill_mode(wait_and_kill=True, wait_time=5) + jobctl.set_kill_mode(wait_and_kill=True, wait_time=5) jobctl.kill(job) assert job.finished, "job.finished should be True. Returned " + str(job.finished) From a9dcf1156537c8bbe36653a754a8efd290e08cbe Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 21:43:05 -0500 Subject: [PATCH 033/101] Change order of hyperthreads and test arguments in BalsamController. --- libensemble/controller.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index ba0773f0b..60c50547c 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -805,10 +805,9 @@ def __init__(self, registry=None, auto_resources=True, #BalsamJobController.controller = self - # DSB: swaps test and hyperthreads vs base JobController def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, - stdout=None, stage_inout=None, test=False, hyperthreads=False): + stdout=None, stage_inout=None, hyperthreads=False, test=False): """Creates a new job, and either launches or schedules to launch in the job controller From 8644e7cf4efafc3f7dd079226771a1e88131b2fe Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 21:46:53 -0500 Subject: [PATCH 034/101] Removed comment about sigterm wait. 
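The test change above (enabling set_kill_mode with wait_and_kill=True and a short wait_time before killing the job) exercises the timeout selection added when kill() switched to launcher.cancel. Going by the comments in that rewrite, the mapping from kill-mode settings to the timeout argument appears to be the following (launcher.cancel is not shown in these patches, so the meaning of each value is inferred; the helper name here is illustrative only):

    def cancel_timeout(kill_signal, wait_and_kill, wait_time):
        """Timeout handed to launcher.cancel for a given kill mode (sketch)."""
        if kill_signal != 'SIGTERM':
            return 0          # hard kill straight away, then wait
        if wait_and_kill:
            return wait_time  # SIGTERM, escalate after wait_time seconds
        return None           # SIGTERM, then wait for the job indefinitely

    assert cancel_timeout('SIGKILL', wait_and_kill=True, wait_time=5) == 0
    assert cancel_timeout('SIGTERM', wait_and_kill=True, wait_time=5) == 5
    assert cancel_timeout('SIGTERM', wait_and_kill=False, wait_time=5) is None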
--- libensemble/launcher.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/libensemble/launcher.py b/libensemble/launcher.py index a02c592a7..8ad329e79 100644 --- a/libensemble/launcher.py +++ b/libensemble/launcher.py @@ -68,9 +68,6 @@ def terminatepg(process): pid = process.pid pgid = os.getpgid(pid) if hasattr(os, 'killpg') else -1 if pgid == pid: - # Claim in original controller was that we need a wait -- - # have not seen this anywhere else in my searching... - #time.sleep(0.1) os.killpg(pgid, signal.SIGTERM) elif hasattr(signal, 'CTRL_BREAK_EVENT'): # Supposedly does a group terminate for Windows... From c2f27cdf76905810b9049b7af3b0c47dfe376368 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 22:35:26 -0500 Subject: [PATCH 035/101] Split Balsam controller into its own module. --- libensemble/balsam_controller.py | 278 +++++++++++++++ libensemble/controller.py | 320 +----------------- .../test_jobcontroller.manager_poll.py | 1 + .../controller_tests/test_jobcontroller.py | 1 + .../test_jobcontroller_multi.py | 3 +- .../test_jobcontroller_hworld.py | 3 +- .../tests/unit_tests/test_jobcontroller.py | 3 +- 7 files changed, 294 insertions(+), 315 deletions(-) create mode 100644 libensemble/balsam_controller.py diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py new file mode 100644 index 000000000..9e094d21e --- /dev/null +++ b/libensemble/balsam_controller.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python + +""" +Module to launch and control running jobs. + +Contains job_controller, job, and inherited classes. A job_controller can +create and manage multiple jobs. The worker or user-side code can issue +and manage jobs using the launch, poll and kill functions. Job attributes +are queried to determine status. Functions are also provided to access +and interrogate files in the job's working directory. + +""" + +import os +import logging +import itertools +import time + +import libensemble.launcher as launcher +from libensemble.register import Register +from libensemble.resources import Resources +from libensemble.controller import \ + Job, JobController, JobControllerException, jassert, STATES, SIGNALS + +logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') +#For debug messages in this module - uncomment (see libE.py to change root logging level) +#logger.setLevel(logging.DEBUG) + + +class BalsamJob(Job): + """Wraps a Balsam Job from the Balsam service. + + The same attributes and query routines are implemented. + + """ + + def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, hostlist=None, + workdir=None, stdout=None, workerid=None): + """Instantiate a new BalsamJob instance. + + A new BalsamJob object is created with an id, status and + configuration attributes. This will normally be created by the + job_controller on a launch. + """ + + super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, + machinefile, hostlist, workdir, stdout, workerid) + + self.balsam_state = None + + #prob want to override workdir attribute with Balsam value - + #though does it exist yet? + #self.workdir = None #Don't know until starts running + self.workdir = workdir #Default for libe now is to run in place. 
+ + + def read_file_in_workdir(self, filename): + return self.process.read_file_in_workdir(filename) + + def read_stdout(self): + return self.process.read_file_in_workdir(self.stdout) + + def calc_job_timing(self): + """Calculate timing information for this job""" + + #Get runtime from Balsam + self.runtime = self.process.runtime_seconds + + if self.launch_time is None: + logger.warning("Cannot calc job total_time - launch time not set") + return + + if self.total_time is None: + self.total_time = time.time() - self.launch_time + + +class BalsamJobController(JobController): + """Inherits from JobController and wraps the Balsam job management service + + .. note:: Job kills are currently not configurable in the Balsam job_controller. + + The set_kill_mode function will do nothing but print a warning. + + """ + + #controller = None + + def __init__(self, registry=None, auto_resources=True, + nodelist_env_slurm=None, nodelist_env_cobalt=None): + """Instantiate a new BalsamJobController instance. + + A new BalsamJobController object is created with an application + registry and configuration attributes + """ + + #Will use super - atleast if use baseclass - but for now dont want to set self.mpi_launcher etc... + + self.registry = registry or Register.default_registry + jassert(self.registry, "Cannot find default registry") + + self.top_level_dir = os.getcwd() + self.auto_resources = auto_resources + + if self.auto_resources: + self.resources = Resources(top_level_dir=self.top_level_dir, central_mode=True, + nodelist_env_slurm=nodelist_env_slurm, + nodelist_env_cobalt=nodelist_env_cobalt) + + #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + + self.list_of_jobs = [] #Why did I put here? Will inherit + + #self.auto_machinefile = False #May in future use the auto_detect part though - to fill in procs/nodes/ranks_per_node + + JobController.controller = self + #BalsamJobController.controller = self + + + def launch(self, calc_type, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, app_args=None, + stdout=None, stage_inout=None, hyperthreads=False, test=False): + """Creates a new job, and either launches or schedules to launch + in the job controller + + The created job object is returned. + """ + import balsam.launcher.dag as dag + + app = self.registry.default_app(calc_type) + jassert(calc_type in ['sim', 'gen'], + "Unrecognized calculation type", calc_type) + jassert(app, "Default {} app is not set".format(calc_type)) + + #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + + #Need test somewhere for if no breakdown supplied.... or only machinefile + + #Specific to this class + if machinefile is not None: + logger.warning("machinefile arg ignored - not supported in Balsam") + jassert(num_procs or num_nodes or ranks_per_node, + "No procs/nodes provided - aborting") + + + #Set num_procs, num_nodes and ranks_per_node for this job + + #Without resource detection + #num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) #Note: not included machinefile option + + #With resource detection (may do only if under-specified?? though that will not tell if larger than possible + #for static allocation - but Balsam does allow dynamic allocation if too large!! + #For now allow user to specify - but default is True.... 
+ if self.auto_resources: + num_procs, num_nodes, ranks_per_node = self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, hyperthreads=hyperthreads) + else: + #Without resource detection + num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) #Note: not included machinefile option + + #temp - while balsam does not accept a standard out name + if stdout is not None: + logger.warning("Balsam does not currently accept a stdout name - ignoring") + stdout = None + + #Will be possible to override with arg when implemented (or can have option to let Balsam assign) + default_workdir = os.getcwd() + + hostlist = None + job = BalsamJob(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, default_workdir, stdout, self.workerID) + + #This is not used with Balsam for run-time as this would include wait time + #Again considering changing launch to submit - or whatever I chose before..... + job.launch_time = time.time() #Not good for timing job - as I dont know when it finishes - only poll/kill est. + + add_job_args = {'name': job.name, + 'workflow': "libe_workflow", #add arg for this + 'user_workdir': default_workdir, #add arg for this + 'application': app.name, + 'args': job.app_args, + 'num_nodes': job.num_nodes, + 'ranks_per_node': job.ranks_per_node} + + if stage_inout is not None: + #For now hardcode staging - for testing + add_job_args['stage_in_url'] = "local:" + stage_inout + "/*" + add_job_args['stage_out_url'] = "local:" + stage_inout + add_job_args['stage_out_files'] = "*.out" + + job.process = dag.add_job(**add_job_args) + + logger.debug("Added job to Balsam database {}: Worker {} nodes {} ppn {}".format(job.name, self.workerID, job.num_nodes, job.ranks_per_node)) + + #job.workdir = job.process.working_directory #Might not be set yet!!!! + self.list_of_jobs.append(job) + return job + + + def poll(self, job): + """Polls and updates the status attributes of the supplied job""" + jassert(isinstance(job, BalsamJob), "Invalid job has been provided") + + # Check the jobs been launched (i.e. it has a process ID) + #Prob should be recoverable and return state - but currently fatal + jassert(job.process, "Polled job has no process ID - check jobs been launched") + + # Do not poll if job already finished + if job.finished: + logger.warning("Polled job has already finished. Not re-polling. " + "Status is {}".format(job.state)) + return + + #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + + # Get current state of jobs from Balsam database + job.process.refresh_from_db() + job.balsam_state = job.process.state #Not really nec to copy have balsam_state - already job.process.state... 
+ #logger.debug('balsam_state for job {} is {}'.format(job.id, job.balsam_state)) + + import balsam.launcher.dag as dag #Might need this before get models - test + from balsam.service import models + + if job.balsam_state in models.END_STATES: + job.finished = True + + job.calc_job_timing() + + if job.workdir is None: + job.workdir = job.process.working_directory + if job.balsam_state == 'JOB_FINISHED': + job.success = True + job.state = 'FINISHED' + elif job.balsam_state == 'PARENT_KILLED': #I'm not using this currently + job.state = 'USER_KILLED' + #job.success = False #Shld already be false - init to false + #job.errcode = #Not currently returned by Balsam API - requested - else will remain as None + elif job.balsam_state in STATES: #In my states + job.state = job.balsam_state + #job.success = False #All other end states are failrues currently - bit risky + #job.errcode = #Not currently returned by Balsam API - requested - else will remain as None + else: + logger.warning("Job finished, but in unrecognized Balsam state {}".format(job.balsam_state)) + job.state = 'UNKNOWN' + + elif job.balsam_state in models.ACTIVE_STATES: + job.state = 'RUNNING' + if job.workdir is None: + job.workdir = job.process.working_directory + + elif job.balsam_state in models.PROCESSABLE_STATES + models.RUNNABLE_STATES: #Does this work - concatenate lists + job.state = 'WAITING' + else: + raise JobControllerException('Job state returned from Balsam is not in known list of Balsam states. Job state is {}'.format(job.balsam_state)) + + # DSB: With this commented out, number of return args is inconsistent (returns job above) + #return job + + def kill(self, job): + """ Kills or cancels the supplied job """ + + jassert(isinstance(job, BalsamJob), "Invalid job has been provided") + + import balsam.launcher.dag as dag + dag.kill(job.process) + + #Could have Wait here and check with Balsam its killed - but not implemented yet. + + job.state = 'USER_KILLED' + job.finished = True + job.calc_job_timing() + + #Check if can wait for kill to complete - affect signal used etc.... + + def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): + """Not currently implemented for BalsamJobController. + + No action is taken + """ + logger.warning("set_kill_mode currently has no action with Balsam controller") diff --git a/libensemble/controller.py b/libensemble/controller.py index 60c50547c..bed2039a6 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -21,7 +21,8 @@ from libensemble.resources import Resources logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') -#For debug messages in this module - uncomment (see libE.py to change root logging level) +#For debug messages in this module - uncomment +#(see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) STATES = """ @@ -142,54 +143,6 @@ def calc_job_timing(self): self.total_time = self.runtime -class BalsamJob(Job): - """Wraps a Balsam Job from the Balsam service. - - The same attributes and query routines are implemented. - - """ - - def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, hostlist=None, - workdir=None, stdout=None, workerid=None): - """Instantiate a new BalsamJob instance. - - A new BalsamJob object is created with an id, status and - configuration attributes. This will normally be created by the - job_controller on a launch. 
- """ - - super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, - machinefile, hostlist, workdir, stdout, workerid) - - self.balsam_state = None - - #prob want to override workdir attribute with Balsam value - - #though does it exist yet? - #self.workdir = None #Don't know until starts running - self.workdir = workdir #Default for libe now is to run in place. - - - def read_file_in_workdir(self, filename): - return self.process.read_file_in_workdir(filename) - - def read_stdout(self): - return self.process.read_file_in_workdir(self.stdout) - - def calc_job_timing(self): - """Calculate timing information for this job""" - - #Get runtime from Balsam - self.runtime = self.process.runtime_seconds - - if self.launch_time is None: - logger.warning("Cannot calc job total_time - launch time not set") - return - - if self.total_time is None: - self.total_time = time.time() - self.launch_time - - class JobController: """The job_controller can create, poll and kill runnable jobs @@ -278,11 +231,6 @@ def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=nodelist_env_slurm, nodelist_env_cobalt=nodelist_env_cobalt) - #todo Configure by autodetection - #In fact it will be a sub-object - most likely with inheritance - based - #on detection or specification. Also the construction of the run-line - #itself will prob. be a function of that object. For now though - do - #like this: mpi_commands = { 'mpich': ['mpirun', '--env {env}', '-machinefile {machinefile}', '-hosts {hostlist}', '-np {num_procs}', @@ -292,40 +240,13 @@ def __init__(self, registry=None, auto_resources=True, '-npernode {ranks_per_node}'], } self.mpi_command = mpi_commands[Resources.get_MPI_variant()] - - #Job controller settings - can be set in user function. self.kill_signal = 'SIGTERM' self.wait_and_kill = True self.wait_time = 60 - - #list_of_jobs: Need to decide on reset... - reset for each calc? - #and how link to libe job (or calc) class - if reset for each calc - - #could store this in job self.list_of_jobs = [] self.workerID = None - - #self.auto_machinefile = True #Create a machinefile automatically - JobController.controller = self - #self.resources = Resources(top_level_dir = self.top_level_dir) - - #If this could share multiple launches could set default job parameters - #here (nodes/ranks etc...) - - - # May change job_controller launch functions to use **kwargs and then init job empty - and use setattr - #eg. To pass through args: - #def launch(**kwargs): - #... 
- #job = Job() - #for k,v in kwargs.items(): - #try: - #getattr(job, k) - #except AttributeError: - #raise ValueError(f"Invalid field {}".format(k)) #Unless not passing through all - #else: - #setattr(job, k, v) def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, @@ -390,13 +311,12 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, jassert(app, "Default {} app is not set".format(calc_type)) - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + #-------- Up to here should be common - can go in a baseclass ------# hostlist = None if machinefile is None and self.auto_resources: #kludging this for now - not nec machinefile if more than one node #- try a hostlist - num_procs, num_nodes, ranks_per_node = \ self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, @@ -420,12 +340,10 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, JobController.job_partition(num_procs, num_nodes, ranks_per_node, machinefile) - default_workdir = os.getcwd() # May override with arg when implemented + default_workdir = os.getcwd() job = Job(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, default_workdir, stdout, self.workerID) - #Temporary perhaps - though when create workdirs - will probably keep - #output in place if stage_inout is not None: logger.warning("stage_inout option ignored in this " "job_controller - runs in-place") @@ -441,12 +359,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, else: logger.debug("Launching job {}: {}". format(job.name, " ".join(runline))) #One line - - #not good for timing job itself as dont know when finishes - if use - #this prob. change to date time or use for timeout. For now using - #for timing with approx end.... job.launch_time = time.time() - job.process = launcher.launch(runline, cwd='./', stdout=open(job.stdout, 'w'), start_new_session=True) @@ -467,21 +380,16 @@ def poll(self, job): """ jassert(isinstance(job, Job), "Invalid job has been provided") - - # Check the jobs been launched (i.e. it has a process ID) - #Prob should be recoverable and return state - but currently fatal jassert(job.process is not None, "Polled job {} has no process ID - check jobs been launched". format(job.name)) - # Do not poll if job already finished - # Maybe should re-poll to check (in case self.finished set in error!)??? if job.finished: logger.warning("Polled job {} has already finished. " "Not re-polling. Status is {}". format(job.name, job.state)) return - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + #-------- Up to here should be common - can go in a baseclass ------# # Poll the job poll = job.process.poll() @@ -582,10 +490,6 @@ def kill(self, job): job.finished = True job.calc_job_timing() - #Need to test out what to do with - #job.errcode #Can it be discovered after killing? 
- #job.success #Could set to false but should be already - only set to true on success - def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): """Configures the kill mode for the job_controller @@ -664,7 +568,8 @@ def get_resources(self, num_procs=None, num_nodes=None, if num_workers > local_node_count: workers_per_node = self.resources.workers_per_node - cores_avail_per_node_per_worker = cores_avail_per_node//workers_per_node + cores_avail_per_node_per_worker = \ + cores_avail_per_node//workers_per_node else: cores_avail_per_node_per_worker = cores_avail_per_node @@ -723,15 +628,12 @@ def get_resources(self, num_procs=None, num_nodes=None, return num_procs, num_nodes, ranks_per_node - def create_machinefile(self, machinefile=None, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): """Create a machinefile based on user supplied config options, completed by detected machine resources""" - #Maybe hyperthreads should be mpi_hyperthreads - machinefile = machinefile or 'machinefile' if os.path.isfile(machinefile): try: @@ -740,7 +642,6 @@ def create_machinefile(self, machinefile=None, num_procs=None, pass node_list = self.resources.local_nodelist - logger.debug("Creating machinefile with {} nodes and {} ranks per node". format(num_nodes, ranks_per_node)) @@ -748,219 +649,14 @@ def create_machinefile(self, machinefile=None, num_procs=None, for node in node_list[:num_nodes]: f.write((node + '\n') * ranks_per_node) - #Return true if created and not empty built_mfile = (os.path.isfile(machinefile) and os.path.getsize(machinefile) > 0) return built_mfile, num_procs, num_nodes, ranks_per_node - #will prob want to adjust based on input - #def get_hostlist(self, machinefile=None, num_procs=None, num_nodes=None, ranks_per_node=None, hyperthreads=False): + def get_hostlist(self): """Create a hostlist based on user supplied config options, completed by detected machine resources""" node_list = self.resources.local_nodelist hostlist_str = ",".join([str(x) for x in node_list]) return hostlist_str - - -class BalsamJobController(JobController): - """Inherits from JobController and wraps the Balsam job management service - - .. note:: Job kills are currently not configurable in the Balsam job_controller. - - The set_kill_mode function will do nothing but print a warning. - - """ - - #controller = None - - def __init__(self, registry=None, auto_resources=True, - nodelist_env_slurm=None, nodelist_env_cobalt=None): - """Instantiate a new BalsamJobController instance. - - A new BalsamJobController object is created with an application - registry and configuration attributes - """ - - #Will use super - atleast if use baseclass - but for now dont want to set self.mpi_launcher etc... - - self.registry = registry or Register.default_registry - jassert(self.registry, "Cannot find default registry") - - self.top_level_dir = os.getcwd() - self.auto_resources = auto_resources - - if self.auto_resources: - self.resources = Resources(top_level_dir=self.top_level_dir, central_mode=True, - nodelist_env_slurm=nodelist_env_slurm, - nodelist_env_cobalt=nodelist_env_cobalt) - - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# - - self.list_of_jobs = [] #Why did I put here? 
Will inherit - - #self.auto_machinefile = False #May in future use the auto_detect part though - to fill in procs/nodes/ranks_per_node - - JobController.controller = self - #BalsamJobController.controller = self - - - def launch(self, calc_type, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, app_args=None, - stdout=None, stage_inout=None, hyperthreads=False, test=False): - """Creates a new job, and either launches or schedules to launch - in the job controller - - The created job object is returned. - """ - import balsam.launcher.dag as dag - - app = self.registry.default_app(calc_type) - jassert(calc_type in ['sim', 'gen'], - "Unrecognized calculation type", calc_type) - jassert(app, "Default {} app is not set".format(calc_type)) - - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# - - #Need test somewhere for if no breakdown supplied.... or only machinefile - - #Specific to this class - if machinefile is not None: - logger.warning("machinefile arg ignored - not supported in Balsam") - jassert(num_procs or num_nodes or ranks_per_node, - "No procs/nodes provided - aborting") - - - #Set num_procs, num_nodes and ranks_per_node for this job - - #Without resource detection - #num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) #Note: not included machinefile option - - #With resource detection (may do only if under-specified?? though that will not tell if larger than possible - #for static allocation - but Balsam does allow dynamic allocation if too large!! - #For now allow user to specify - but default is True.... - if self.auto_resources: - num_procs, num_nodes, ranks_per_node = self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, hyperthreads=hyperthreads) - else: - #Without resource detection - num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) #Note: not included machinefile option - - #temp - while balsam does not accept a standard out name - if stdout is not None: - logger.warning("Balsam does not currently accept a stdout name - ignoring") - stdout = None - - #Will be possible to override with arg when implemented (or can have option to let Balsam assign) - default_workdir = os.getcwd() - - hostlist = None - job = BalsamJob(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, default_workdir, stdout, self.workerID) - - #This is not used with Balsam for run-time as this would include wait time - #Again considering changing launch to submit - or whatever I chose before..... - job.launch_time = time.time() #Not good for timing job - as I dont know when it finishes - only poll/kill est. - - add_job_args = {'name': job.name, - 'workflow': "libe_workflow", #add arg for this - 'user_workdir': default_workdir, #add arg for this - 'application': app.name, - 'args': job.app_args, - 'num_nodes': job.num_nodes, - 'ranks_per_node': job.ranks_per_node} - - if stage_inout is not None: - #For now hardcode staging - for testing - add_job_args['stage_in_url'] = "local:" + stage_inout + "/*" - add_job_args['stage_out_url'] = "local:" + stage_inout - add_job_args['stage_out_files'] = "*.out" - - job.process = dag.add_job(**add_job_args) - - logger.debug("Added job to Balsam database {}: Worker {} nodes {} ppn {}".format(job.name, self.workerID, job.num_nodes, job.ranks_per_node)) - - #job.workdir = job.process.working_directory #Might not be set yet!!!! 
- self.list_of_jobs.append(job) - return job - - - def poll(self, job): - """Polls and updates the status attributes of the supplied job""" - jassert(isinstance(job, BalsamJob), "Invalid job has been provided") - - # Check the jobs been launched (i.e. it has a process ID) - #Prob should be recoverable and return state - but currently fatal - jassert(job.process, "Polled job has no process ID - check jobs been launched") - - # Do not poll if job already finished - if job.finished: - logger.warning("Polled job has already finished. Not re-polling. " - "Status is {}".format(job.state)) - return - - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# - - # Get current state of jobs from Balsam database - job.process.refresh_from_db() - job.balsam_state = job.process.state #Not really nec to copy have balsam_state - already job.process.state... - #logger.debug('balsam_state for job {} is {}'.format(job.id, job.balsam_state)) - - import balsam.launcher.dag as dag #Might need this before get models - test - from balsam.service import models - - if job.balsam_state in models.END_STATES: - job.finished = True - - job.calc_job_timing() - - if job.workdir is None: - job.workdir = job.process.working_directory - if job.balsam_state == 'JOB_FINISHED': - job.success = True - job.state = 'FINISHED' - elif job.balsam_state == 'PARENT_KILLED': #I'm not using this currently - job.state = 'USER_KILLED' - #job.success = False #Shld already be false - init to false - #job.errcode = #Not currently returned by Balsam API - requested - else will remain as None - elif job.balsam_state in STATES: #In my states - job.state = job.balsam_state - #job.success = False #All other end states are failrues currently - bit risky - #job.errcode = #Not currently returned by Balsam API - requested - else will remain as None - else: - logger.warning("Job finished, but in unrecognized Balsam state {}".format(job.balsam_state)) - job.state = 'UNKNOWN' - - elif job.balsam_state in models.ACTIVE_STATES: - job.state = 'RUNNING' - if job.workdir is None: - job.workdir = job.process.working_directory - - elif job.balsam_state in models.PROCESSABLE_STATES + models.RUNNABLE_STATES: #Does this work - concatenate lists - job.state = 'WAITING' - else: - raise JobControllerException('Job state returned from Balsam is not in known list of Balsam states. Job state is {}'.format(job.balsam_state)) - - # DSB: With this commented out, number of return args is inconsistent (returns job above) - #return job - - def kill(self, job): - """ Kills or cancels the supplied job """ - - jassert(isinstance(job, BalsamJob), "Invalid job has been provided") - - import balsam.launcher.dag as dag - dag.kill(job.process) - - #Could have Wait here and check with Balsam its killed - but not implemented yet. - - job.state = 'USER_KILLED' - job.finished = True - job.calc_job_timing() - - #Check if can wait for kill to complete - affect signal used etc.... - - def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): - """Not currently implemented for BalsamJobController. 
- - No action is taken - """ - logger.warning("set_kill_mode currently has no action with Balsam controller") diff --git a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py index c15d885cb..231ec4289 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py @@ -25,6 +25,7 @@ def build_simfunc(): from libensemble.register import * from libensemble.controller import * +from libensemble.balsam_controller import * #sim_app = 'simdir/my_simjob.x' #gen_app = 'gendir/my_genjob.x' diff --git a/libensemble/tests/controller_tests/test_jobcontroller.py b/libensemble/tests/controller_tests/test_jobcontroller.py index bd901c278..1995f537c 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.py @@ -16,6 +16,7 @@ def build_simfunc(): from libensemble.register import * from libensemble.controller import * +from libensemble.balsam_controller import * #sim_app = 'simdir/my_simjob.x' #gen_app = 'gendir/my_genjob.x' diff --git a/libensemble/tests/controller_tests/test_jobcontroller_multi.py b/libensemble/tests/controller_tests/test_jobcontroller_multi.py index 38acfe475..efeadb658 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller_multi.py +++ b/libensemble/tests/controller_tests/test_jobcontroller_multi.py @@ -18,7 +18,8 @@ def build_simfunc(): #--------------- Calling script --------------------------------------------------------------- from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController, BalsamJobController +from libensemble.controller import JobController +from libensemble.baslam_controller import BalsamJobController #sim_app = 'simdir/my_simjob.x' #gen_app = 'gendir/my_genjob.x' diff --git a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py index e8c8f2502..483b3d403 100644 --- a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py +++ b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py @@ -10,7 +10,8 @@ from libensemble.sim_funcs.job_control_hworld import job_control_hworld from libensemble.gen_funcs.uniform_sampling import uniform_random_sample from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController, BalsamJobController +from libensemble.controller import JobController +from libensemble.balsam_controller import BalsamJobController from libensemble.calc_info import CalcInfo from libensemble.resources import Resources from libensemble.message_numbers import * diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index 010d370c8..5cc7e7aa0 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -7,7 +7,8 @@ import pytest import socket from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController, BalsamJobController +from libensemble.controller import JobController +from libensemble.balsam_controller import BalsamJobController USE_BALSAM = False From a005805a8574cd57b827199bd1b2992066197628 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 22:46:50 -0500 Subject: [PATCH 036/101] Added wait_and_kill stuff to another job 
controller test. --- libensemble/tests/unit_tests/test_jobcontroller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index 5cc7e7aa0..ba6392645 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -377,6 +377,7 @@ def test_launch_and_kill(): cores = NCORES args_for_sim = 'sleep 2.0' job_list = [] + jobctl.set_kill_mode(wait_and_kill=True, timeout=1) for jobid in range(5): job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) jobctl.kill(job) From 569bec5c580485ecf848f5639373356c7aa60006 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 6 Sep 2018 22:57:59 -0500 Subject: [PATCH 037/101] Update JobController to change interface for kill handling (just set wait_time) I think the standard way to do things *ought* to be to send a SIGTERM, wait politely for termination, then send a SIGKILL if the process does not wrap up before some timeout. We can do this just by setting a timeout argument (and use 0 to indicate "SIGKILL at once" and None to indicate "never SIGKILL"). --- libensemble/balsam_controller.py | 2 +- libensemble/controller.py | 64 ++----------------- .../tests/unit_tests/test_jobcontroller.py | 45 +------------ 3 files changed, 8 insertions(+), 103 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 9e094d21e..bd6d85e7a 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -20,7 +20,7 @@ from libensemble.register import Register from libensemble.resources import Resources from libensemble.controller import \ - Job, JobController, JobControllerException, jassert, STATES, SIGNALS + Job, JobController, JobControllerException, jassert, STATES logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') #For debug messages in this module - uncomment (see libE.py to change root logging level) diff --git a/libensemble/controller.py b/libensemble/controller.py index bed2039a6..9954c6b2d 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -34,13 +34,6 @@ USER_KILLED FAILED""".split() -SIGNALS = """ -SIGTERM -SIGKILL""".split() - - -#I may want to use a top-level abstract/base class for maximum re-use -# - else inherited controller will be reimplementing common code class JobControllerException(Exception): pass @@ -240,8 +233,6 @@ def __init__(self, registry=None, auto_resources=True, '-npernode {ranks_per_node}'], } self.mpi_command = mpi_commands[Resources.get_MPI_variant()] - self.kill_signal = 'SIGTERM' - self.wait_and_kill = True self.wait_time = 60 self.list_of_jobs = [] self.workerID = None @@ -452,12 +443,10 @@ def kill(self, job): job: obj: Job The job object.to be polled. - The signal used is determined by the job_controller attribute - will be send to the job, followed by a wait for - the process to terminate. If the attribute is - True, then a SIGKILL will be sent if the job has not finished - after seconds. The kill can be configured using the - set_kill_mode function. + Sends SIGTERM, waits for a period of for graceful + termination, then sends a hard kill with SIGKILL. If + is 0, we go immediately to SIGKILL; if is None, we + never do a SIGKILL. 
""" jassert(isinstance(job, Job), "Invalid job has been provided") @@ -474,55 +463,12 @@ def kill(self, job): "check jobs been launched".format(job.name)) logger.debug("Killing job {}".format(job.name)) - - jassert(self.kill_signal in ['SIGTERM', 'SIGKILL'], - "Unknown kill signal") - - timeout = 0 # Default is to just kill and wait - if self.kill_signal == 'SIGTERM': # For a graceful kill - timeout = None # Terminate and just wait - if self.wait_and_kill: # Or if we want to wait and kill... - timeout = self.wait_time # Set a timeout - - launcher.cancel(job.process, timeout) - + launcher.cancel(job.process, self.wait_time) job.state = 'USER_KILLED' job.finished = True job.calc_job_timing() - def set_kill_mode(self, signal=None, wait_and_kill=None, wait_time=None): - """Configures the kill mode for the job_controller - - Parameters - ---------- - - signal: String, optional - The signal type to be sent to kill job: 'SIGTERM' or 'SIGKILL' - - wait_and_kill: boolean, optional - If True, a SIGKILL will be sent after seconds if - the process has not terminated. - - wait_time: int, optional - The number of seconds to wait for the job to finish before - sending a SIGKILL when wait_and_kill is set. (Default is 60). - """ - if signal is not None: - jassert(signal in SIGNALS, - "Unknown signal {} supplied to set_kill_mode". - format(signal)) - self.kill_signal = signal - - if wait_and_kill is not None: - self.wait_and_kill = wait_and_kill - - if wait_time is not None: - self.wait_time = wait_time - if not wait_and_kill: - logger.warning('wait_time does nothing if wait_and_kill=False') - - def get_job(self, jobid): """ Returns the job object for the supplied job ID """ if self.list_of_jobs: diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index ba6392645..1109936ec 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -332,7 +332,7 @@ def test_doublekill(): args_for_sim = 'sleep 2.0' job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) jobctl.poll(job) - jobctl.set_kill_mode(wait_and_kill=True, wait_time=5) + jobctl.wait_time = 5 jobctl.kill(job) assert job.finished, "job.finished should be True. Returned " + str(job.finished) @@ -377,7 +377,7 @@ def test_launch_and_kill(): cores = NCORES args_for_sim = 'sleep 2.0' job_list = [] - jobctl.set_kill_mode(wait_and_kill=True, timeout=1) + jobctl.wait_time = 1 for jobid in range(5): job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) jobctl.kill(job) @@ -512,46 +512,6 @@ def test_poll_job_with_no_launch(): assert 0 -def test_set_kill_mode(): - print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) - setup_job_controller() - jobctl = JobController.controller - cores = NCORES - - signal_b4 = jobctl.kill_signal - wait_and_kill_b4 = jobctl.wait_and_kill - wait_time_b4 = jobctl.wait_time - - # Change nothing. - jobctl.set_kill_mode() - assert jobctl.kill_signal == signal_b4 - assert jobctl.wait_and_kill == wait_and_kill_b4 - assert jobctl.wait_time == wait_time_b4 - - # While these options are set - wait_time will not be used. Result is warning. 
- jobctl.set_kill_mode(signal='SIGKILL', wait_and_kill=False, wait_time=10) - assert jobctl.kill_signal == 'SIGKILL' - assert not jobctl.wait_and_kill - assert jobctl.wait_time == 10 - - # Now correct - jobctl.set_kill_mode(signal='SIGTERM', wait_and_kill=True, wait_time=20) - assert jobctl.kill_signal == 'SIGTERM' - assert jobctl.wait_and_kill - assert jobctl.wait_time == 20 - - #Todo: - #Testing wait_and_kill is harder - need to create a process that does not respond to sigterm in time. - - # Try set to unknown signal - try: - jobctl.set_kill_mode(signal='SIGDIE') - except: - assert 1 - else: - assert 0 - - def test_job_failure(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_job_controller() @@ -582,7 +542,6 @@ def test_job_failure(): test_launch_no_app() test_kill_job_with_no_launch() test_poll_job_with_no_launch() - test_set_kill_mode() test_job_failure() #teardown_module(__file__) From e6618aa1242cd05838cc6c1ceec458357421888f Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 09:42:50 -0500 Subject: [PATCH 038/101] Killed some commented out dead code. --- libensemble/controller.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index c44b30331..e95769e5a 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -80,8 +80,6 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, self.ranks_per_node = ranks_per_node self.machinefile = machinefile self.hostlist = hostlist - #self.stdout = stdout - #self.stderr = stderr self.workerID = workerid jassert(app is not None, From 578211b8ebeda2de7f61b517eb2ff9bb0ac844c0 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 10:49:48 -0500 Subject: [PATCH 039/101] Minor moving about in manager. --- libensemble/libE_manager.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/libensemble/libE_manager.py b/libensemble/libE_manager.py index 5840a322d..adb5203fb 100644 --- a/libensemble/libE_manager.py +++ b/libensemble/libE_manager.py @@ -47,6 +47,13 @@ def filter_nans(array): return array[~np.isnan(array)] +_WALLCLOCK_MSG = """ +Termination due to elapsed_wallclock_time has occurred. +A last attempt has been made to receive any completed work. +Posting nonblocking receives and kill messages for all active workers. +""" + + class Manager: """Manager class for libensemble.""" @@ -177,9 +184,9 @@ def _save_every_k_gens(self): def _check_work_order(self, Work, w): """Check validity of an allocation function order. """ - assert w != 0, "Can't send to worker 0; this is the manager. Aborting" + assert w != 0, "Can't send to worker 0; this is the manager." assert self.W[w-1]['active'] == 0, \ - "Allocation function requested work to an already active worker. Aborting" + "Allocation function requested work to an already active worker." work_rows = Work['libE_info']['H_rows'] if len(work_rows): work_fields = set(Work['H_fields']) @@ -224,7 +231,8 @@ def _check_received_calc(D_recv): calc_type = D_recv['calc_type'] calc_status = D_recv['calc_status'] assert calc_type in [EVAL_SIM_TAG, EVAL_GEN_TAG], \ - 'Aborting, Unknown calculation type received. Received type: ' + str(calc_type) + "Aborting, Unknown calculation type received. 
" \ + "Received type: {}".format(calc_type) assert calc_status in [FINISHED_PERSISTENT_SIM_TAG, FINISHED_PERSISTENT_GEN_TAG, UNSET_TAG, @@ -235,7 +243,8 @@ def _check_received_calc(D_recv): WORKER_KILL, JOB_FAILED, WORKER_DONE], \ - 'Aborting: Unknown calculation status received. Received status: ' + str(calc_status) + "Aborting: Unknown calculation status received. " \ + "Received status: {}".format(calc_status) def _receive_from_workers(self, persis_info): """Receive calculation output from workers. Loops over all @@ -333,6 +342,9 @@ def _final_receive_and_kill(self, persis_info): persis_info = self._receive_from_workers(persis_info) if self.term_test(logged=False) == 2 and any(self.W['active']): self._print_wallclock_term() + print(_WALLCLOCK_MSG) + sys.stdout.flush() + sys.stderr.flush() self._read_final_messages() exit_flag = 2 @@ -340,15 +352,6 @@ def _final_receive_and_kill(self, persis_info): print("\nlibEnsemble manager total time:", self.elapsed()) return persis_info, exit_flag - @staticmethod - def _print_wallclock_term(): - """Print termination message for wall clock elapsed.""" - print("Termination due to elapsed_wallclock_time has occurred.\n"\ - "A last attempt has been made to receive any completed work.\n"\ - "Posting nonblocking receives and kill messages for all active workers\n") - sys.stdout.flush() - sys.stderr.flush() - # --- Main loop def _queue_update(self, H, persis_info): From eb8c347e22d3bd3dc6b2930a976cb7d9ba0801ea Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 11:40:17 -0500 Subject: [PATCH 040/101] Fixed incomplete move of a print statement... --- libensemble/libE_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libensemble/libE_manager.py b/libensemble/libE_manager.py index adb5203fb..cbf326046 100644 --- a/libensemble/libE_manager.py +++ b/libensemble/libE_manager.py @@ -341,7 +341,6 @@ def _final_receive_and_kill(self, persis_info): while any(self.W['active']) and exit_flag == 0: persis_info = self._receive_from_workers(persis_info) if self.term_test(logged=False) == 2 and any(self.W['active']): - self._print_wallclock_term() print(_WALLCLOCK_MSG) sys.stdout.flush() sys.stderr.flush() From a82a14ccebf05b0b31b17e694ccc594307d78c9b Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 14:49:14 -0500 Subject: [PATCH 041/101] Moved poll and kill into job. --- libensemble/balsam_controller.py | 175 ++++++++---------- libensemble/controller.py | 142 +++++++------- libensemble/sim_funcs/job_control_hworld.py | 2 +- .../test_jobcontroller.manager_poll.py | 4 +- .../controller_tests/test_jobcontroller.py | 4 +- .../test_jobcontroller_multi.py | 6 +- .../tests/unit_tests/test_jobcontroller.py | 21 +-- 7 files changed, 149 insertions(+), 205 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 02864805b..c7538ad5a 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -76,47 +76,95 @@ def calc_job_timing(self): if self.total_time is None: self.total_time = time.time() - self.launch_time + def poll(self): + """Polls and updates the status attributes of the supplied job""" + jassert(isinstance(self, BalsamJob), "Invalid job has been provided") -class BalsamJobController(JobController): - """Inherits from JobController and wraps the Balsam job management service + # Check the jobs been launched (i.e. 
it has a process ID) + #Prob should be recoverable and return state - but currently fatal + jassert(self.process, "Polled job has no process ID - check jobs been launched") - .. note:: Job kills are currently not configurable in the Balsam job_controller. + # Do not poll if job already finished + if self.finished: + logger.warning("Polled job has already finished. Not re-polling. " + "Status is {}".format(self.state)) + return - The set_kill_mode function will do nothing but print a warning. + #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# - """ + # Get current state of jobs from Balsam database + self.process.refresh_from_db() + self.balsam_state = self.process.state #Not really nec to copy have balsam_state - already job.process.state... + #logger.debug('balsam_state for job {} is {}'.format(self.id, self.balsam_state)) - #controller = None + import balsam.launcher.dag as dag #Might need this before get models - test + from balsam.service import models - def __init__(self, registry=None, auto_resources=True, - nodelist_env_slurm=None, nodelist_env_cobalt=None): - """Instantiate a new BalsamJobController instance. + if self.balsam_state in models.END_STATES: + self.finished = True + + self.calc_job_timing() + + if self.workdir is None: + self.workdir = self.process.working_directory + if self.balsam_state == 'JOB_FINISHED': + self.success = True + self.state = 'FINISHED' + elif self.balsam_state == 'PARENT_KILLED': #I'm not using this currently + self.state = 'USER_KILLED' + #self.success = False #Shld already be false - init to false + #self.errcode = #Not currently returned by Balsam API - requested - else will remain as None + elif self.balsam_state in STATES: #In my states + self.state = self.balsam_state + #self.success = False #All other end states are failrues currently - bit risky + #self.errcode = #Not currently returned by Balsam API - requested - else will remain as None + else: + logger.warning("Job finished, but in unrecognized " + "Balsam state {}".format(self.balsam_state)) + self.state = 'UNKNOWN' - A new BalsamJobController object is created with an application - registry and configuration attributes - """ + elif self.balsam_state in models.ACTIVE_STATES: + self.state = 'RUNNING' + if self.workdir is None: + self.workdir = self.process.working_directory - #Will use super - atleast if use baseclass - but for now dont want to set self.mpi_launcher etc... + elif self.balsam_state in models.PROCESSABLE_STATES + models.RUNNABLE_STATES: #Does this work - concatenate lists + self.state = 'WAITING' + else: + raise JobControllerException( + "Job state returned from Balsam is not in known list of " + "Balsam states. Job state is {}".format(self.balsam_state)) - self.registry = registry or Register.default_registry - jassert(self.registry, "Cannot find default registry") - self.top_level_dir = os.getcwd() - self.auto_resources = auto_resources + def kill(self, wait_time=None): + """ Kills or cancels the supplied job """ - if self.auto_resources: - self.resources = Resources(top_level_dir=self.top_level_dir, central_mode=True, - nodelist_env_slurm=nodelist_env_slurm, - nodelist_env_cobalt=nodelist_env_cobalt) + import balsam.launcher.dag as dag + dag.kill(self.process) - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + #Could have Wait here and check with Balsam its killed - but not implemented yet. 
+ + self.state = 'USER_KILLED' + self.finished = True + self.calc_job_timing() - self.list_of_jobs = [] #Why did I put here? Will inherit - #self.auto_machinefile = False #May in future use the auto_detect part though - to fill in procs/nodes/ranks_per_node +class BalsamJobController(JobController): + """Inherits from JobController and wraps the Balsam job management service + + .. note:: Job kills are not configurable in the Balsam job_controller. + + """ + def __init__(self, registry=None, auto_resources=True, + nodelist_env_slurm=None, nodelist_env_cobalt=None): + """Instantiate a new BalsamJobController instance. - JobController.controller = self - #BalsamJobController.controller = self + A new BalsamJobController object is created with an application + registry and configuration attributes + """ + super().__init__(registry, auto_resources, + nodelist_env_slurm, nodelist_env_cobalt) + self.mpi_launcher = None def launch(self, calc_type, num_procs=None, num_nodes=None, @@ -198,78 +246,3 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, #job.workdir = job.process.working_directory #Might not be set yet!!!! self.list_of_jobs.append(job) return job - - - def poll(self, job): - """Polls and updates the status attributes of the supplied job""" - jassert(isinstance(job, BalsamJob), "Invalid job has been provided") - - # Check the jobs been launched (i.e. it has a process ID) - #Prob should be recoverable and return state - but currently fatal - jassert(job.process, "Polled job has no process ID - check jobs been launched") - - # Do not poll if job already finished - if job.finished: - logger.warning("Polled job has already finished. Not re-polling. " - "Status is {}".format(job.state)) - return - - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# - - # Get current state of jobs from Balsam database - job.process.refresh_from_db() - job.balsam_state = job.process.state #Not really nec to copy have balsam_state - already job.process.state... - #logger.debug('balsam_state for job {} is {}'.format(job.id, job.balsam_state)) - - import balsam.launcher.dag as dag #Might need this before get models - test - from balsam.service import models - - if job.balsam_state in models.END_STATES: - job.finished = True - - job.calc_job_timing() - - if job.workdir is None: - job.workdir = job.process.working_directory - if job.balsam_state == 'JOB_FINISHED': - job.success = True - job.state = 'FINISHED' - elif job.balsam_state == 'PARENT_KILLED': #I'm not using this currently - job.state = 'USER_KILLED' - #job.success = False #Shld already be false - init to false - #job.errcode = #Not currently returned by Balsam API - requested - else will remain as None - elif job.balsam_state in STATES: #In my states - job.state = job.balsam_state - #job.success = False #All other end states are failrues currently - bit risky - #job.errcode = #Not currently returned by Balsam API - requested - else will remain as None - else: - logger.warning("Job finished, but in unrecognized Balsam state {}".format(job.balsam_state)) - job.state = 'UNKNOWN' - - elif job.balsam_state in models.ACTIVE_STATES: - job.state = 'RUNNING' - if job.workdir is None: - job.workdir = job.process.working_directory - - elif job.balsam_state in models.PROCESSABLE_STATES + models.RUNNABLE_STATES: #Does this work - concatenate lists - job.state = 'WAITING' - else: - raise JobControllerException('Job state returned from Balsam is not in known list of Balsam states. 
Job state is {}'.format(job.balsam_state)) - - # DSB: With this commented out, number of return args is inconsistent (returns job above) - #return job - - def kill(self, job): - """ Kills or cancels the supplied job """ - - jassert(isinstance(job, BalsamJob), "Invalid job has been provided") - - import balsam.launcher.dag as dag - dag.kill(job.process) - - #Could have Wait here and check with Balsam its killed - but not implemented yet. - - job.state = 'USER_KILLED' - job.finished = True - job.calc_job_timing() - diff --git a/libensemble/controller.py b/libensemble/controller.py index 505c3cf67..8ed7e9464 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -143,21 +143,73 @@ def calc_job_timing(self): if self.total_time is None: self.total_time = self.runtime + def poll(self): + """Polls and updates the status attributes of the job""" + + jassert(self.process is not None, + "Polled job {} has no process ID - check jobs been launched". + format(self.name)) + if self.finished: + logger.warning("Polled job {} has already finished. " + "Not re-polling. Status is {}". + format(self.name, self.state)) + return + + #-------- Up to here should be common - can go in a baseclass ------# + + # Poll the job + poll = self.process.poll() + if poll is None: + self.state = 'RUNNING' + return + + self.finished = True + self.calc_job_timing() + + # Want to be more fine-grained about non-success (fail vs user kill?) + self.errcode = self.process.returncode + self.success = (self.errcode == 0) + self.state = 'FINISHED' if self.success else 'FAILED' + logger.debug("Job {} completed with errcode {} ({})". + format(self.name, self.errcode, self.state)) + + def kill(self, wait_time=60): + """Kills or cancels the supplied job + + Sends SIGTERM, waits for a period of for graceful + termination, then sends a hard kill with SIGKILL. If + is 0, we go immediately to SIGKILL; if is None, we + never do a SIGKILL. + """ + if self.finished: + logger.warning("Trying to kill job that is no longer running. " + "Job {}: Status is {}".format(self.name, self.state)) + return + + if self.process is None: + time.sleep(0.2) + jassert(self.process is not None, + "Attempting to kill job {} that has no process ID - " + "check jobs been launched".format(self.name)) + + logger.debug("Killing job {}".format(self.name)) + launcher.cancel(self.process, wait_time) + self.state = 'USER_KILLED' + self.finished = True + self.calc_job_timing() + class JobController: """The job_controller can create, poll and kill runnable jobs **Class Attributes:** - :cvar JobController: controller: A class attribute holding the default job_controller. + :cvar JobController: controller: The default job_controller. **Object Attributes:** :ivar Register registry: The registry associated with this job_controller - :ivar String manager_signal: Contains any signals received by manager ('none'|'finish'|'kill') - :ivar String kill_signal: The kill signal to be sent to jobs - :ivar boolean wait_and_kill: Whether running in wait_and_kill mode (If True a hard kill will be sent after a timeout period) - :ivar int wait_time: Timeout period for hard kill, when wait_and_kill is set. 
+ :ivar int wait_time: Timeout period for hard kill :ivar list list_of_jobs: A list of jobs created in this job controller :ivar int workerID: The workerID associated with this job controller @@ -381,46 +433,6 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, return job - def poll(self, job): - """ Polls and updates the status attributes of the supplied job - - Parameters - ----------- - - job: obj: Job - The job object.to be polled. - - """ - - jassert(isinstance(job, Job), "Invalid job has been provided") - jassert(job.process is not None, - "Polled job {} has no process ID - check jobs been launched". - format(job.name)) - if job.finished: - logger.warning("Polled job {} has already finished. " - "Not re-polling. Status is {}". - format(job.name, job.state)) - return - - #-------- Up to here should be common - can go in a baseclass ------# - - # Poll the job - poll = job.process.poll() - if poll is None: - job.state = 'RUNNING' - return - - job.finished = True - job.calc_job_timing() - - # Want to be more fine-grained about non-success (fail vs user kill?) - job.errcode = job.process.returncode - job.success = (job.errcode == 0) - job.state = 'FINISHED' if job.success else 'FAILED' - logger.debug("Job {} completed with errcode {} ({})". - format(job.name, job.errcode, job.state)) - - def manager_poll(self): """ Polls for a manager signal @@ -449,41 +461,6 @@ def manager_poll(self): "ignoring".format(man_signal)) - def kill(self, job): - """Kills or cancels the supplied job - - Parameters - ----------- - - job: obj: Job - The job object.to be polled. - - Sends SIGTERM, waits for a period of for graceful - termination, then sends a hard kill with SIGKILL. If - is 0, we go immediately to SIGKILL; if is None, we - never do a SIGKILL. - """ - - jassert(isinstance(job, Job), "Invalid job has been provided") - - if job.finished: - logger.warning("Trying to kill job that is no longer running. 
" - "Job {}: Status is {}".format(job.name, job.state)) - return - - if job.process is None: - time.sleep(0.2) - jassert(job.process is not None, - "Attempting to kill job {} that has no process ID - " - "check jobs been launched".format(job.name)) - - logger.debug("Killing job {}".format(job.name)) - launcher.cancel(job.process, self.wait_time) - job.state = 'USER_KILLED' - job.finished = True - job.calc_job_timing() - - def get_job(self, jobid): """ Returns the job object for the supplied job ID """ if self.list_of_jobs: @@ -621,3 +598,8 @@ def get_hostlist(self): node_list = self.resources.local_nodelist hostlist_str = ",".join([str(x) for x in node_list]) return hostlist_str + + def kill(self, job): + "Kill a job" + jassert(isinstance(job, Job), "Invalid job has been provided") + job.kill(self.wait_time) diff --git a/libensemble/sim_funcs/job_control_hworld.py b/libensemble/sim_funcs/job_control_hworld.py index 88d8ca11f..77ef903f1 100644 --- a/libensemble/sim_funcs/job_control_hworld.py +++ b/libensemble/sim_funcs/job_control_hworld.py @@ -25,7 +25,7 @@ def polling_loop(jobctl, job, timeout_sec=6.0, delay=1.0): break #print('Polling job at time', time.time() - start) - jobctl.poll(job) + job.poll() if job.finished: break elif job.state == 'RUNNING': diff --git a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py index 231ec4289..a6564a2d9 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py @@ -76,7 +76,7 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): time.sleep(delay) print('Polling at time', time.time() - start) - jobctl.poll(job) + job.poll() if job.finished: break elif job.state == 'WAITING': print('Job waiting to launch') elif job.state == 'RUNNING': print('Job still running ....') @@ -97,7 +97,7 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): if job.finished: print('Now killed') #double check - jobctl.poll(job) + job.poll() print('Job state is', job.state) diff --git a/libensemble/tests/controller_tests/test_jobcontroller.py b/libensemble/tests/controller_tests/test_jobcontroller.py index 1995f537c..5e9da2e1a 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.py @@ -55,7 +55,7 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): while time.time() - start < timeout_sec: time.sleep(delay) print('Polling at time', time.time() - start) - jobctl.poll(job) + job.poll() if job.finished: break elif job.state == 'WAITING': print('Job waiting to launch') elif job.state == 'RUNNING': print('Job still running ....') @@ -83,7 +83,7 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): if job.finished: print('Now killed') #double check - jobctl.poll(job) + job.poll() print('Job state is', job.state) diff --git a/libensemble/tests/controller_tests/test_jobcontroller_multi.py b/libensemble/tests/controller_tests/test_jobcontroller_multi.py index efeadb658..4bb6c1eb3 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller_multi.py +++ b/libensemble/tests/controller_tests/test_jobcontroller_multi.py @@ -67,8 +67,7 @@ def polling_loop(jobctl, job_list, timeout_sec=40.0, delay=1.0): if not job.finished: time.sleep(delay) print('Polling job %d at time %f' % (job.id, time.time() - start)) - #job.poll() - jobctl.poll(job) + job.poll() if job.finished: continue elif 
job.state == 'WAITING': print('Job %d waiting to launch' % (job.id)) elif job.state == 'RUNNING': print('Job %d still running ....' % (job.id)) @@ -105,8 +104,7 @@ def polling_loop(jobctl, job_list, timeout_sec=40.0, delay=1.0): if job.finished: print('Job %d Now killed. Status: %s' % (job.id, job.state)) #double check - #job.poll() - jobctl.poll(job) + job.poll() print('Job %d state is %s' % (job.id, job.state)) diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index 1109936ec..9db3fadeb 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -115,7 +115,7 @@ def polling_loop(jobctl, job, timeout_sec=0.5, delay=0.05): while time.time() - start < timeout_sec: time.sleep(delay) print('Polling at time', time.time() - start) - jobctl.poll(job) + job.poll() if job.finished: break elif job.state == 'WAITING': print('Job waiting to launch') elif job.state == 'RUNNING': print('Job still running ....') @@ -150,8 +150,7 @@ def polling_loop_multijob(jobctl, job_list, timeout_sec=4.0, delay=0.05): if not job.finished: time.sleep(delay) print('Polling job %d at time %f' % (job.id, time.time() - start)) - #job.poll() - jobctl.poll(job) + job.poll() if job.finished: continue elif job.state == 'WAITING': print('Job %d waiting to launch' % (job.id)) elif job.state == 'RUNNING': print('Job %d still running ....' % (job.id)) @@ -331,7 +330,7 @@ def test_doublekill(): cores = NCORES args_for_sim = 'sleep 2.0' job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) - jobctl.poll(job) + job.poll() jobctl.wait_time = 5 jobctl.kill(job) @@ -356,14 +355,14 @@ def test_finish_and_kill(): job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) while not job.finished: time.sleep(0.1) - jobctl.poll(job) + job.poll() assert job.finished, "job.finished should be True. Returned " + str(job.finished) assert job.state == 'FINISHED', "job.state should be FINISHED. Returned " + str(job.state) jobctl.kill(job) assert job.finished, "job.finished should be True. Returned " + str(job.finished) assert job.state == 'FINISHED', "job.state should be FINISHED. Returned " + str(job.state) #Try polling after finish - should return with no effect - jobctl.poll(job) + job.poll() assert job.finished, "job.finished should be True. Returned " + str(job.finished) assert job.state == 'FINISHED', "job.state should be FINISHED. Returned " + str(job.state) @@ -492,20 +491,12 @@ def test_poll_job_with_no_launch(): jobctl = JobController.controller cores = NCORES - #Try poll invalid job - try: - jobctl.poll('myjob') - except: - assert 1 - else: - assert 0 - # Create a job directly with no launch (Not supported for users) registry = Register.default_registry myapp = registry.sim_default_app job1 = Job(app = myapp, stdout = 'stdout.txt') try: - jobctl.poll(job1) + job1.poll() except: assert 1 else: From 68b402715144440315ba4933acc34ed6cb409ebc Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 15:29:04 -0500 Subject: [PATCH 042/101] Simplified get_job iteration. 
--- libensemble/controller.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 8ed7e9464..95b8d7e21 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -463,15 +463,10 @@ def manager_poll(self): def get_job(self, jobid): """ Returns the job object for the supplied job ID """ - if self.list_of_jobs: - for job in self.list_of_jobs: - if job.id == jobid: - return job + job = next((j for j in self.list_of_jobs if j.id == jobid), None) + if job is None: logger.warning("Job {} not found in joblist".format(jobid)) - return None - logger.warning("Job {} not found in joblist. Joblist is empty". - format(jobid)) - return None + return job def set_workerID(self, workerid): From b60dfafcaaf9878bac643b75301dded2f2f77888 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 15:44:35 -0500 Subject: [PATCH 043/101] Killed dead manager_signal in controller. --- libensemble/controller.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 95b8d7e21..aa8383693 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -70,7 +70,6 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, self.launch_time = None self.runtime = None self.total_time = None - self.manager_signal = 'none' #Run attributes self.app = app From b018469da4d86f3fd87338bed21c19ba5f13cbf1 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 16:34:38 -0500 Subject: [PATCH 044/101] Minor cleanup in balsam_controller. --- libensemble/balsam_controller.py | 34 +++++++++++++++++--------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index c7538ad5a..4e6588ff3 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -21,7 +21,8 @@ Job, JobController, JobControllerException, jassert, STATES logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') -#For debug messages in this module - uncomment (see libE.py to change root logging level) +#For debug messages in this module - uncomment +#(see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) @@ -90,14 +91,15 @@ def poll(self): "Status is {}".format(self.state)) return - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + #-------- Up to here should be common - can go in a baseclass ------# # Get current state of jobs from Balsam database self.process.refresh_from_db() - self.balsam_state = self.process.state #Not really nec to copy have balsam_state - already job.process.state... - #logger.debug('balsam_state for job {} is {}'.format(self.id, self.balsam_state)) + self.balsam_state = self.process.state + #Not really nec to copy have balsam_state - already job.process.state... 
- import balsam.launcher.dag as dag #Might need this before get models - test + #Might need this before get models - test + import balsam.launcher.dag as dag from balsam.service import models if self.balsam_state in models.END_STATES: @@ -105,19 +107,18 @@ def poll(self): self.calc_job_timing() - if self.workdir is None: - self.workdir = self.process.working_directory + self.workdir = self.workdir or self.process.working_directory if self.balsam_state == 'JOB_FINISHED': self.success = True self.state = 'FINISHED' - elif self.balsam_state == 'PARENT_KILLED': #I'm not using this currently + elif self.balsam_state == 'PARENT_KILLED': # Not currently used self.state = 'USER_KILLED' #self.success = False #Shld already be false - init to false - #self.errcode = #Not currently returned by Balsam API - requested - else will remain as None + #self.errcode = #Not currently returned by Balsam (requested) elif self.balsam_state in STATES: #In my states self.state = self.balsam_state - #self.success = False #All other end states are failrues currently - bit risky - #self.errcode = #Not currently returned by Balsam API - requested - else will remain as None + #self.success = False #All other end states are failures currently - bit risky + #self.errcode = #Not currently returned by Balsam (requested) else: logger.warning("Job finished, but in unrecognized " "Balsam state {}".format(self.balsam_state)) @@ -125,8 +126,7 @@ def poll(self): elif self.balsam_state in models.ACTIVE_STATES: self.state = 'RUNNING' - if self.workdir is None: - self.workdir = self.process.working_directory + self.workdir = self.workdir or self.process.working_directory elif self.balsam_state in models.PROCESSABLE_STATES + models.RUNNABLE_STATES: #Does this work - concatenate lists self.state = 'WAITING' @@ -142,7 +142,8 @@ def kill(self, wait_time=None): import balsam.launcher.dag as dag dag.kill(self.process) - #Could have Wait here and check with Balsam its killed - but not implemented yet. + #Could have Wait here and check with Balsam its killed - + #but not implemented yet. self.state = 'USER_KILLED' self.finished = True @@ -183,9 +184,10 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, "Unrecognized calculation type", calc_type) jassert(app, "Default {} app is not set".format(calc_type)) - #-------- Up to here should be common - can go in a baseclass and make all concrete classes inherit ------# + #-------- Up to here should be common - can go in a baseclass ------# - #Need test somewhere for if no breakdown supplied.... or only machinefile + #Need test somewhere for if no breakdown supplied.... + #or only machinefile #Specific to this class if machinefile is not None: From b0392af30f186dad8e236407bd2f3087d6efdafb Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 17:28:45 -0500 Subject: [PATCH 045/101] Clean up unused state information in Job and BalsamJob. 
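This change strips the MPI run geometry (num_procs, num_nodes, ranks_per_node, machinefile, hostlist) out of the Job object; the geometry is instead gathered into a dict at launch time and handed to the runline formatter. A rough standalone sketch of that shape, with a toy formatter standing in for launcher.form_command (all names here are illustrative, not the library API):

    class ToyJob:
        """Carries only what the job itself needs: app, args, workdir."""
        def __init__(self, app, app_args, workdir):
            self.app, self.app_args, self.workdir = app, app_args, workdir

    def toy_runline(mpi_specs, job):
        # Run geometry arrives per launch instead of living on the job.
        line = ['mpirun', '-np', str(mpi_specs['num_procs']),
                '-npernode', str(mpi_specs['ranks_per_node']), job.app]
        return line + (job.app_args.split() if job.app_args else [])

    job = ToyJob('./my_simjob.x', 'sleep 3', '.')
    print(toy_runline({'num_procs': 32, 'ranks_per_node': 16}, job))
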
--- libensemble/balsam_controller.py | 77 ++++++++++++++------------------ libensemble/controller.py | 21 ++++----- 2 files changed, 42 insertions(+), 56 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 4e6588ff3..f2af70403 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -15,7 +15,6 @@ import logging import time -from libensemble.register import Register from libensemble.resources import Resources from libensemble.controller import \ Job, JobController, JobControllerException, jassert, STATES @@ -33,27 +32,16 @@ class BalsamJob(Job): """ - def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, hostlist=None, - workdir=None, stdout=None, stderr=None, workerid=None): + def __init__(self, app=None, app_args=None, workdir=None, + stdout=None, stderr=None, workerid=None): """Instantiate a new BalsamJob instance. A new BalsamJob object is created with an id, status and configuration attributes. This will normally be created by the job_controller on a launch. """ - - super().__init__(app, app_args, num_procs, num_nodes, ranks_per_node, - machinefile, hostlist, workdir, stdout, stderr, - workerid) - - self.balsam_state = None - - #prob want to override workdir attribute with Balsam value - - #though does it exist yet? - #self.workdir = None #Don't know until starts running - self.workdir = workdir #Default for libe now is to run in place. - + # May want to override workdir with Balsam value when it exists + Job.__init__(self, app, app_args, workdir, stdout, stderr, workerid) def read_file_in_workdir(self, filename): return self.process.read_file_in_workdir(filename) @@ -83,7 +71,8 @@ def poll(self): # Check the jobs been launched (i.e. it has a process ID) #Prob should be recoverable and return state - but currently fatal - jassert(self.process, "Polled job has no process ID - check jobs been launched") + jassert(self.process, + "Polled job has no process ID - check jobs been launched") # Do not poll if job already finished if self.finished: @@ -95,45 +84,42 @@ def poll(self): # Get current state of jobs from Balsam database self.process.refresh_from_db() - self.balsam_state = self.process.state - #Not really nec to copy have balsam_state - already job.process.state... 
+ balsam_state = self.process.state #Might need this before get models - test import balsam.launcher.dag as dag from balsam.service import models - if self.balsam_state in models.END_STATES: + if balsam_state in models.END_STATES: self.finished = True - self.calc_job_timing() - self.workdir = self.workdir or self.process.working_directory - if self.balsam_state == 'JOB_FINISHED': - self.success = True + self.success = (balsam_state == 'JOB_FINISHED') + # self.errcode - requested feature from Balsam devs + + if balsam_state == 'JOB_FINISHED': self.state = 'FINISHED' - elif self.balsam_state == 'PARENT_KILLED': # Not currently used + elif balsam_state == 'PARENT_KILLED': # Not currently used self.state = 'USER_KILLED' - #self.success = False #Shld already be false - init to false - #self.errcode = #Not currently returned by Balsam (requested) - elif self.balsam_state in STATES: #In my states - self.state = self.balsam_state - #self.success = False #All other end states are failures currently - bit risky - #self.errcode = #Not currently returned by Balsam (requested) + elif balsam_state in STATES: #In my states + self.state = balsam_state else: logger.warning("Job finished, but in unrecognized " - "Balsam state {}".format(self.balsam_state)) + "Balsam state {}".format(balsam_state)) self.state = 'UNKNOWN' - elif self.balsam_state in models.ACTIVE_STATES: + elif balsam_state in models.ACTIVE_STATES: self.state = 'RUNNING' self.workdir = self.workdir or self.process.working_directory - elif self.balsam_state in models.PROCESSABLE_STATES + models.RUNNABLE_STATES: #Does this work - concatenate lists + elif (balsam_state in models.PROCESSABLE_STATES or + balsam_state in models.RUNNABLE_STATES): self.state = 'WAITING' + else: raise JobControllerException( "Job state returned from Balsam is not in known list of " - "Balsam states. Job state is {}".format(self.balsam_state)) + "Balsam states. Job state is {}".format(balsam_state)) def kill(self, wait_time=None): @@ -207,8 +193,9 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, if self.auto_resources: num_procs, num_nodes, ranks_per_node = self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, hyperthreads=hyperthreads) else: - #Without resource detection - num_procs, num_nodes, ranks_per_node = JobController.job_partition(num_procs, num_nodes, ranks_per_node) #Note: not included machinefile option + #Without resource detection (note: not included machinefile option) + num_procs, num_nodes, ranks_per_node = \ + JobController.job_partition(num_procs, num_nodes, ranks_per_node) #temp - while balsam does not accept a standard out name if stdout is not None or stderr is not None: @@ -217,11 +204,11 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, stdout = None stderr = None - #Will be possible to override with arg when implemented (or can have option to let Balsam assign) + #Will be possible to override with arg when implemented + #(or can have option to let Balsam assign) default_workdir = os.getcwd() - - hostlist = None - job = BalsamJob(app, app_args, num_procs, num_nodes, ranks_per_node, machinefile, hostlist, default_workdir, stdout, stderr, self.workerID) + job = BalsamJob(app, app_args, default_workdir, + stdout, stderr, self.workerID) #This is not used with Balsam for run-time as this would include wait time #Again considering changing launch to submit - or whatever I chose before..... 
@@ -232,8 +219,8 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, 'user_workdir': default_workdir, #add arg for this 'application': app.name, 'args': job.app_args, - 'num_nodes': job.num_nodes, - 'ranks_per_node': job.ranks_per_node} + 'num_nodes': num_nodes, + 'ranks_per_node': ranks_per_node} if stage_inout is not None: #For now hardcode staging - for testing @@ -243,7 +230,9 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, job.process = dag.add_job(**add_job_args) - logger.debug("Added job to Balsam database {}: Worker {} nodes {} ppn {}".format(job.name, self.workerID, job.num_nodes, job.ranks_per_node)) + logger.debug("Added job to Balsam database {}: " + "Worker {} nodes {} ppn {}". + format(job.name, self.workerID, num_nodes, ranks_per_node)) #job.workdir = job.process.working_directory #Might not be set yet!!!! self.list_of_jobs.append(job) diff --git a/libensemble/controller.py b/libensemble/controller.py index aa8383693..568795e81 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -50,9 +50,8 @@ class Job: newid = itertools.count() - def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, hostlist=None, - workdir=None, stdout=None, stderr=None, workerid=None): + def __init__(self, app=None, app_args=None, workdir=None, + stdout=None, stderr=None, workerid=None): """Instantiate a new Job instance. A new job object is created with an id, status and configuration @@ -74,11 +73,6 @@ def __init__(self, app=None, app_args=None, num_procs=None, num_nodes=None, #Run attributes self.app = app self.app_args = app_args - self.num_procs = num_procs - self.num_nodes = num_nodes - self.ranks_per_node = ranks_per_node - self.machinefile = machinefile - self.hostlist = hostlist self.workerID = workerid jassert(app is not None, @@ -403,15 +397,18 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node, machinefile) default_workdir = os.getcwd() - job = Job(app, app_args, num_procs, num_nodes, ranks_per_node, - machinefile, hostlist, default_workdir, stdout, stderr, - self.workerID) + job = Job(app, app_args, default_workdir, stdout, stderr, self.workerID) if stage_inout is not None: logger.warning("stage_inout option ignored in this " "job_controller - runs in-place") - runline = launcher.form_command(self.mpi_command, vars(job)) + mpi_specs = {'num_procs': num_procs, + 'num_nodes': num_nodes, + 'ranks_per_node': ranks_per_node, + 'machinefile': machinefile, + 'hostlist': hostlist} + runline = launcher.form_command(self.mpi_command, mpi_specs) runline.append(job.app.full_path) if job.app_args is not None: runline.extend(job.app_args.split()) From 5314056b905b134306a81e63f09bbc451f41f0a1 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 17:49:02 -0500 Subject: [PATCH 046/101] Small tightening up in controller. --- libensemble/controller.py | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index 568795e81..c2e0e4135 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -66,6 +66,8 @@ def __init__(self, app=None, app_args=None, workdir=None, self.errcode = None self.finished = False # True means job ran, not that it succeeded self.success = False + + # Note: runtime, total_time, and time since launch may differ! 
self.launch_time = None self.runtime = None self.total_time = None @@ -119,9 +121,6 @@ def read_stderr(self): """Open and reads the job's stderr file in the job's workdir""" return self.read_file_in_workdir(self.stderr) - #Note - this is currently only final job-time. May make running job time. - #And prob want to use for polling in sim func - esp in balsam - - #where want acutal runtime not time since launch def calc_job_timing(self): """Calculate timing information for this job""" if self.launch_time is None: @@ -485,22 +484,17 @@ def get_resources(self, num_procs=None, num_nodes=None, raised if these are infeasible. """ - node_list = self.resources.local_nodelist - - if hyperthreads: - cores_avail_per_node = self.resources.logical_cores_avail_per_node - else: - cores_avail_per_node = self.resources.physical_cores_avail_per_node - - num_workers = self.resources.num_workers - local_node_count = self.resources.local_node_count + resources = self.resources + node_list = resources.local_nodelist + num_workers = resources.num_workers + local_node_count = resources.local_node_count - if num_workers > local_node_count: - workers_per_node = self.resources.workers_per_node - cores_avail_per_node_per_worker = \ - cores_avail_per_node//workers_per_node - else: - cores_avail_per_node_per_worker = cores_avail_per_node + cores_avail_per_node = \ + (resources.logical_cores_avail_per_node if hyperthreads else + resources.physical_cores_avail_per_node) + workers_per_node = \ + (resources.workers_per_node if num_workers > local_node_count else 1) + cores_avail_per_node_per_worker = cores_avail_per_node//workers_per_node jassert(node_list, "Node list is empty - aborting") @@ -514,11 +508,7 @@ def get_resources(self, num_procs=None, num_nodes=None, format(num_nodes, ranks_per_node)) elif not num_nodes and not ranks_per_node: num_nodes = local_node_count - #Here is where really want a compact/scatter option - go for - #scatter (could get cores and say if less than one node - but then - #hyperthreads complication if no psutil installed) elif not num_procs and not ranks_per_node: - #Who would just put num_nodes??? ranks_per_node = cores_avail_per_node_per_worker elif not num_procs and not num_nodes: num_nodes = local_node_count From a5a0469afe40852592696feb145b6130410dc513 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Fri, 7 Sep 2018 21:24:49 -0500 Subject: [PATCH 047/101] Split MPI resource partitioning routines from controller to mpi_resources. 
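The partitioning logic that moves into MPIResources.job_partition reconciles whichever of num_procs / num_nodes / ranks_per_node were supplied and checks that they agree. A condensed restatement for illustration only (the machinefile path and the fuller validation in the patch are omitted; this assumes enough of the three values are given to determine the rest):

    def partition(num_procs=None, num_nodes=None, ranks_per_node=None):
        # Fill in whichever value is missing from the others.
        if not num_procs:
            num_procs = num_nodes * ranks_per_node
        elif not num_nodes:
            ranks_per_node = ranks_per_node or num_procs
            num_nodes = num_procs // ranks_per_node
        elif not ranks_per_node:
            ranks_per_node = num_procs // num_nodes
        # Reject inconsistent requests, e.g. num_procs=30 on num_nodes=4.
        assert num_procs == num_nodes * ranks_per_node
        return num_procs, num_nodes, ranks_per_node

    print(partition(num_nodes=2, ranks_per_node=16))   # (32, 2, 16)
    print(partition(num_procs=32, num_nodes=2))        # (32, 2, 16)
    print(partition(num_procs=8))                      # (8, 1, 8)
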
--- libensemble/balsam_controller.py | 10 +- libensemble/controller.py | 176 +++---------------------------- libensemble/mpi_resources.py | 160 ++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 162 deletions(-) create mode 100644 libensemble/mpi_resources.py diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index f2af70403..6d014d758 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -15,11 +15,11 @@ import logging import time -from libensemble.resources import Resources +from libensemble.mpi_resources import MPIResources from libensemble.controller import \ Job, JobController, JobControllerException, jassert, STATES -logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') +logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') #For debug messages in this module - uncomment #(see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) @@ -191,7 +191,11 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, #for static allocation - but Balsam does allow dynamic allocation if too large!! #For now allow user to specify - but default is True.... if self.auto_resources: - num_procs, num_nodes, ranks_per_node = self.get_resources(num_procs=num_procs, num_nodes=num_nodes, ranks_per_node=ranks_per_node, hyperthreads=hyperthreads) + num_procs, num_nodes, ranks_per_node = \ + self.resources.get_resources( + num_procs=num_procs, + num_nodes=num_nodes, ranks_per_node=ranks_per_node, + hyperthreads=hyperthreads) else: #Without resource detection (note: not included machinefile option) num_procs, num_nodes, ranks_per_node = \ diff --git a/libensemble/controller.py b/libensemble/controller.py index c2e0e4135..c5319d7a7 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - """ Module to launch and control running jobs. 
@@ -18,9 +16,9 @@ import libensemble.launcher as launcher from libensemble.register import Register -from libensemble.resources import Resources +from libensemble.mpi_resources import MPIResources -logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') +logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') #For debug messages in this module - uncomment #(see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) @@ -209,35 +207,6 @@ class JobController: controller = None - @staticmethod - def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): - """Takes provided nprocs/nodes/ranks and outputs working - configuration of procs/nodes/ranks or error""" - - #If machinefile is provided - ignore everything else - if machinefile: - if num_procs or num_nodes or ranks_per_node: - logger.warning("Machinefile provided - overriding " - "procs/nodes/ranks_per_node") - return None, None, None - - if not num_procs: - jassert(num_nodes and ranks_per_node, - "Need num_procs, num_nodes/ranks_per_node, or machinefile") - num_procs = num_nodes * ranks_per_node - - elif not num_nodes: - ranks_per_node = ranks_per_node or num_procs - num_nodes = num_procs//ranks_per_node - - elif not ranks_per_node: - ranks_per_node = num_procs//num_nodes - - jassert(num_procs == num_nodes*ranks_per_node, - "num_procs does not equal num_nodes*ranks_per_node") - return num_procs, num_nodes, ranks_per_node - - def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new JobController instance. @@ -280,9 +249,10 @@ def __init__(self, registry=None, auto_resources=True, self.manager_signal = 'none' if self.auto_resources: - self.resources = Resources(top_level_dir=self.top_level_dir, - nodelist_env_slurm=nodelist_env_slurm, - nodelist_env_cobalt=nodelist_env_cobalt) + self.resources = \ + MPIResources(top_level_dir=self.top_level_dir, + nodelist_env_slurm=nodelist_env_slurm, + nodelist_env_cobalt=nodelist_env_cobalt) mpi_commands = { 'mpich': ['mpirun', '--env {env}', '-machinefile {machinefile}', @@ -292,7 +262,7 @@ def __init__(self, registry=None, auto_resources=True, '-host {hostlist}', '-np {num_procs}', '-npernode {ranks_per_node}'], } - self.mpi_command = mpi_commands[Resources.get_MPI_variant()] + self.mpi_command = mpi_commands[MPIResources.get_MPI_variant()] self.wait_time = 60 self.list_of_jobs = [] self.workerID = None @@ -373,27 +343,29 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, #kludging this for now - not nec machinefile if more than one node #- try a hostlist num_procs, num_nodes, ranks_per_node = \ - self.get_resources(num_procs=num_procs, num_nodes=num_nodes, - ranks_per_node=ranks_per_node, - hyperthreads=hyperthreads) + self.resources.get_resources( + num_procs=num_procs, + num_nodes=num_nodes, ranks_per_node=ranks_per_node, + hyperthreads=hyperthreads) if num_nodes > 1: #hostlist - hostlist = self.get_hostlist() + hostlist = self.resources.get_hostlist() else: #machinefile machinefile = "machinefile_autogen" if self.workerID is not None: machinefile += "_for_worker_{}".format(self.workerID) mfile_created, num_procs, num_nodes, ranks_per_node = \ - self.create_machinefile(machinefile, num_procs, num_nodes, - ranks_per_node, hyperthreads) + self.resources.create_machinefile( + machinefile, num_procs, num_nodes, + ranks_per_node, hyperthreads) jassert(mfile_created, "Auto-creation of machinefile failed") else: num_procs, num_nodes, ranks_per_node = \ - 
JobController.job_partition(num_procs, num_nodes, - ranks_per_node, machinefile) + MPIResources.job_partition(num_procs, num_nodes, + ranks_per_node, machinefile) default_workdir = os.getcwd() job = Job(app, app_args, default_workdir, stdout, stderr, self.workerID) @@ -455,7 +427,6 @@ def manager_poll(self): logger.warning("Received unrecognized manager signal {} - " "ignoring".format(man_signal)) - def get_job(self, jobid): """ Returns the job object for the supplied job ID """ job = next((j for j in self.list_of_jobs if j.id == jobid), None) @@ -463,123 +434,10 @@ def get_job(self, jobid): logger.warning("Job {} not found in joblist".format(jobid)) return job - def set_workerID(self, workerid): """Sets the worker ID for this job_controller""" self.workerID = workerid - - #Reformat create_machinefile to use this and also use this for - #non-machinefile cases when auto-detecting - def get_resources(self, num_procs=None, num_nodes=None, - ranks_per_node=None, hyperthreads=False): - """Reconciles user supplied options with available Worker - resources to produce run configuration. - - Detects resources available to worker, checks if an existing - user supplied config is valid, and fills in any missing config - information (ie. num_procs/num_nodes/ranks_per_node) - - User supplied config options are honoured, and an exception is - raised if these are infeasible. - """ - - resources = self.resources - node_list = resources.local_nodelist - num_workers = resources.num_workers - local_node_count = resources.local_node_count - - cores_avail_per_node = \ - (resources.logical_cores_avail_per_node if hyperthreads else - resources.physical_cores_avail_per_node) - workers_per_node = \ - (resources.workers_per_node if num_workers > local_node_count else 1) - cores_avail_per_node_per_worker = cores_avail_per_node//workers_per_node - - jassert(node_list, "Node list is empty - aborting") - - #If no decomposition supplied - use all available cores/nodes - if not num_procs and not num_nodes and not ranks_per_node: - num_nodes = local_node_count - ranks_per_node = cores_avail_per_node_per_worker - logger.debug("No decomposition supplied - " - "using all available resource. " - "Nodes: {} ranks_per_node {}". - format(num_nodes, ranks_per_node)) - elif not num_nodes and not ranks_per_node: - num_nodes = local_node_count - elif not num_procs and not ranks_per_node: - ranks_per_node = cores_avail_per_node_per_worker - elif not num_procs and not num_nodes: - num_nodes = local_node_count - - #checks config is consistent and sufficient to express - - #does not check actual resources - num_procs, num_nodes, ranks_per_node = \ - JobController.job_partition(num_procs, num_nodes, ranks_per_node) - - #Could just downgrade to those available with warning - for now error - jassert(num_nodes <= local_node_count, - "Not enough nodes to honour arguments. " - "Requested {}. Only {} available". - format(num_nodes, local_node_count)) - - jassert(ranks_per_node <= cores_avail_per_node, - "Not enough processors on a node to honour arguments. " - "Requested {}. Only {} available". - format(ranks_per_node, cores_avail_per_node)) - - jassert(ranks_per_node <= cores_avail_per_node_per_worker, - "Not enough processors per worker to honour arguments. " - "Requested {}. Only {} available". - format(ranks_per_node, cores_avail_per_node_per_worker)) - - jassert(num_procs <= (cores_avail_per_node * local_node_count), - "Not enough procs to honour arguments. " - "Requested {}. Only {} available". 
- format(num_procs, cores_avail_per_node*local_node_count)) - - if num_nodes < local_node_count: - logger.warning("User constraints mean fewer nodes being used " - "than available. {} nodes used. {} nodes available". - format(num_nodes, local_node_count)) - - return num_procs, num_nodes, ranks_per_node - - - def create_machinefile(self, machinefile=None, num_procs=None, - num_nodes=None, ranks_per_node=None, - hyperthreads=False): - """Create a machinefile based on user supplied config options, - completed by detected machine resources""" - - machinefile = machinefile or 'machinefile' - if os.path.isfile(machinefile): - try: - os.remove(machinefile) - except: - pass - - node_list = self.resources.local_nodelist - logger.debug("Creating machinefile with {} nodes and {} ranks per node". - format(num_nodes, ranks_per_node)) - - with open(machinefile, 'w') as f: - for node in node_list[:num_nodes]: - f.write((node + '\n') * ranks_per_node) - - built_mfile = (os.path.isfile(machinefile) - and os.path.getsize(machinefile) > 0) - return built_mfile, num_procs, num_nodes, ranks_per_node - - - def get_hostlist(self): - """Create a hostlist based on user supplied config options, - completed by detected machine resources""" - node_list = self.resources.local_nodelist - hostlist_str = ",".join([str(x) for x in node_list]) - return hostlist_str - def kill(self, job): "Kill a job" jassert(isinstance(job, Job), "Invalid job has been provided") diff --git a/libensemble/mpi_resources.py b/libensemble/mpi_resources.py new file mode 100644 index 000000000..ccd4bce69 --- /dev/null +++ b/libensemble/mpi_resources.py @@ -0,0 +1,160 @@ +""" +Manage libensemble resources related to MPI jobs launched from nodes. +""" + +import os +import logging + +from libensemble.resources import Resources, ResourcesException + +def rassert(test, *args): + if not test: + raise ResourcesException(*args) + + +logger = logging.getLogger(__name__) +#For debug messages in this module - uncomment +#(see libE.py to change root logging level) +#logger.setLevel(logging.DEBUG) + + +class MPIResources(Resources): + + @staticmethod + def job_partition(num_procs, num_nodes, ranks_per_node, machinefile=None): + """Takes provided nprocs/nodes/ranks and outputs working + configuration of procs/nodes/ranks or error""" + + #If machinefile is provided - ignore everything else + if machinefile: + if num_procs or num_nodes or ranks_per_node: + logger.warning("Machinefile provided - overriding " + "procs/nodes/ranks_per_node") + return None, None, None + + if not num_procs: + rassert(num_nodes and ranks_per_node, + "Need num_procs, num_nodes/ranks_per_node, or machinefile") + num_procs = num_nodes * ranks_per_node + + elif not num_nodes: + ranks_per_node = ranks_per_node or num_procs + num_nodes = num_procs//ranks_per_node + + elif not ranks_per_node: + ranks_per_node = num_procs//num_nodes + + rassert(num_procs == num_nodes*ranks_per_node, + "num_procs does not equal num_nodes*ranks_per_node") + return num_procs, num_nodes, ranks_per_node + + + #Reformat create_machinefile to use this and also use this for + #non-machinefile cases when auto-detecting + def get_resources(self, num_procs=None, num_nodes=None, + ranks_per_node=None, hyperthreads=False): + """Reconciles user supplied options with available Worker + resources to produce run configuration. + + Detects resources available to worker, checks if an existing + user supplied config is valid, and fills in any missing config + information (ie. 
num_procs/num_nodes/ranks_per_node) + + User supplied config options are honoured, and an exception is + raised if these are infeasible. + """ + + node_list = self.local_nodelist + num_workers = self.num_workers + local_node_count = self.local_node_count + + cores_avail_per_node = \ + (self.logical_cores_avail_per_node if hyperthreads else + self.physical_cores_avail_per_node) + workers_per_node = \ + (self.workers_per_node if num_workers > local_node_count else 1) + cores_avail_per_node_per_worker = cores_avail_per_node//workers_per_node + + rassert(node_list, "Node list is empty - aborting") + + #If no decomposition supplied - use all available cores/nodes + if not num_procs and not num_nodes and not ranks_per_node: + num_nodes = local_node_count + ranks_per_node = cores_avail_per_node_per_worker + logger.debug("No decomposition supplied - " + "using all available resource. " + "Nodes: {} ranks_per_node {}". + format(num_nodes, ranks_per_node)) + elif not num_nodes and not ranks_per_node: + num_nodes = local_node_count + elif not num_procs and not ranks_per_node: + ranks_per_node = cores_avail_per_node_per_worker + elif not num_procs and not num_nodes: + num_nodes = local_node_count + + #checks config is consistent and sufficient to express - + #does not check actual resources + num_procs, num_nodes, ranks_per_node = \ + MPIResources.job_partition(num_procs, num_nodes, ranks_per_node) + + #Could just downgrade to those available with warning - for now error + rassert(num_nodes <= local_node_count, + "Not enough nodes to honour arguments. " + "Requested {}. Only {} available". + format(num_nodes, local_node_count)) + + rassert(ranks_per_node <= cores_avail_per_node, + "Not enough processors on a node to honour arguments. " + "Requested {}. Only {} available". + format(ranks_per_node, cores_avail_per_node)) + + rassert(ranks_per_node <= cores_avail_per_node_per_worker, + "Not enough processors per worker to honour arguments. " + "Requested {}. Only {} available". + format(ranks_per_node, cores_avail_per_node_per_worker)) + + rassert(num_procs <= (cores_avail_per_node * local_node_count), + "Not enough procs to honour arguments. " + "Requested {}. Only {} available". + format(num_procs, cores_avail_per_node*local_node_count)) + + if num_nodes < local_node_count: + logger.warning("User constraints mean fewer nodes being used " + "than available. {} nodes used. {} nodes available". + format(num_nodes, local_node_count)) + + return num_procs, num_nodes, ranks_per_node + + + def create_machinefile(self, machinefile=None, num_procs=None, + num_nodes=None, ranks_per_node=None, + hyperthreads=False): + """Create a machinefile based on user supplied config options, + completed by detected machine resources""" + + machinefile = machinefile or 'machinefile' + if os.path.isfile(machinefile): + try: + os.remove(machinefile) + except: + pass + + node_list = self.local_nodelist + logger.debug("Creating machinefile with {} nodes and {} ranks per node". 
+ format(num_nodes, ranks_per_node)) + + with open(machinefile, 'w') as f: + for node in node_list[:num_nodes]: + f.write((node + '\n') * ranks_per_node) + + built_mfile = (os.path.isfile(machinefile) + and os.path.getsize(machinefile) > 0) + return built_mfile, num_procs, num_nodes, ranks_per_node + + + def get_hostlist(self): + """Create a hostlist based on user supplied config options, + completed by detected machine resources""" + node_list = self.local_nodelist + hostlist_str = ",".join([str(x) for x in node_list]) + return hostlist_str From 842b90c8430386b251ade29aa75145b513952132 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Sat, 8 Sep 2018 10:52:04 -0500 Subject: [PATCH 048/101] Split JobController into JobController (base class) + MPIJobController. NB: If we want to have a serial job controller, this is probably the sensible way of doing things... --- libensemble/balsam_controller.py | 6 +- libensemble/controller.py | 132 +++++++++++------- libensemble/sim_funcs/job_control_hworld.py | 4 +- .../test_jobcontroller.manager_poll.py | 2 +- .../controller_tests/test_jobcontroller.py | 2 +- .../test_jobcontroller_multi.py | 4 +- .../tests/regression_tests/test_comms.py | 4 +- .../test_jobcontroller_hworld.py | 4 +- .../tests/unit_tests/test_job_funcs.py | 4 +- .../tests/unit_tests/test_jobcontroller.py | 10 +- 10 files changed, 98 insertions(+), 74 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 6d014d758..46f28a1e5 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -17,7 +17,7 @@ from libensemble.mpi_resources import MPIResources from libensemble.controller import \ - Job, JobController, JobControllerException, jassert, STATES + Job, MPIJobController, JobControllerException, jassert, STATES logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') #For debug messages in this module - uncomment @@ -136,7 +136,7 @@ def kill(self, wait_time=None): self.calc_job_timing() -class BalsamJobController(JobController): +class BalsamJobController(MPIJobController): """Inherits from JobController and wraps the Balsam job management service .. note:: Job kills are not configurable in the Balsam job_controller. @@ -199,7 +199,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, else: #Without resource detection (note: not included machinefile option) num_procs, num_nodes, ranks_per_node = \ - JobController.job_partition(num_procs, num_nodes, ranks_per_node) + MPIResources.job_partition(num_procs, num_nodes, ranks_per_node) #temp - while balsam does not accept a standard out name if stdout is not None or stderr is not None: diff --git a/libensemble/controller.py b/libensemble/controller.py index c5319d7a7..6913a689c 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -207,6 +207,83 @@ class JobController: controller = None + def __init__(self, registry=None): + """Instantiate a new JobController instance. + + A new JobController object is created with an application + registry and configuration attributes. A registry object must + have been created. + + This is typically created in the user calling script. If + auto_resources is True, an evaluation of system resources is + performance during this call. + + Parameters + ---------- + registry: obj: Registry, optional + A registry containing the applications to use in this + job_controller (Default: Use Register.default_registry). 
+ """ + + self.registry = registry or Register.default_registry + jassert(self.registry is not None, "Cannot find default registry") + + self.top_level_dir = os.getcwd() + self.manager_signal = 'none' + + self.wait_time = 60 + self.list_of_jobs = [] + self.workerID = None + JobController.controller = self + + def manager_poll(self): + """ Polls for a manager signal + + The job controller manager_signal attribute will be updated. + + """ + + #Will use MPI_MODE from settings.py but for now assume MPI + from libensemble.message_numbers import \ + STOP_TAG, MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL + from mpi4py import MPI + + # Manager Signals + # Stop tag may be manager interupt as diff kill/stop/pause.... + comm = MPI.COMM_WORLD + status = MPI.Status() + if comm.Iprobe(source=0, tag=STOP_TAG, status=status): + logger.info('Manager probe hit true') + man_signal = comm.recv(source=0, tag=STOP_TAG, status=status) + if man_signal == MAN_SIGNAL_FINISH: + self.manager_signal = 'finish' + elif man_signal == MAN_SIGNAL_KILL: + self.manager_signal = 'kill' + else: + logger.warning("Received unrecognized manager signal {} - " + "ignoring".format(man_signal)) + + def get_job(self, jobid): + """ Returns the job object for the supplied job ID """ + job = next((j for j in self.list_of_jobs if j.id == jobid), None) + if job is None: + logger.warning("Job {} not found in joblist".format(jobid)) + return job + + def set_workerID(self, workerid): + """Sets the worker ID for this job_controller""" + self.workerID = workerid + + def kill(self, job): + "Kill a job" + jassert(isinstance(job, Job), "Invalid job has been provided") + job.kill(self.wait_time) + + +class MPIJobController(JobController): + """The MPI job_controller can create, poll and kill runnable MPI jobs + """ + def __init__(self, registry=None, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new JobController instance. @@ -241,13 +318,8 @@ def __init__(self, registry=None, auto_resources=True, auto_resources=True. """ - self.registry = registry or Register.default_registry - jassert(self.registry is not None, "Cannot find default registry") - - self.top_level_dir = os.getcwd() + JobController.__init__(self, registry) self.auto_resources = auto_resources - self.manager_signal = 'none' - if self.auto_resources: self.resources = \ MPIResources(top_level_dir=self.top_level_dir, @@ -263,10 +335,6 @@ def __init__(self, registry=None, auto_resources=True, '-npernode {ranks_per_node}'], } self.mpi_command = mpi_commands[MPIResources.get_MPI_variant()] - self.wait_time = 60 - self.list_of_jobs = [] - self.workerID = None - JobController.controller = self def launch(self, calc_type, num_procs=None, num_nodes=None, @@ -398,47 +466,3 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, self.list_of_jobs.append(job) return job - - - def manager_poll(self): - """ Polls for a manager signal - - The job controller manager_signal attribute will be updated. - - """ - - #Will use MPI_MODE from settings.py but for now assume MPI - from libensemble.message_numbers import \ - STOP_TAG, MAN_SIGNAL_FINISH, MAN_SIGNAL_KILL - from mpi4py import MPI - - # Manager Signals - # Stop tag may be manager interupt as diff kill/stop/pause.... 
- comm = MPI.COMM_WORLD - status = MPI.Status() - if comm.Iprobe(source=0, tag=STOP_TAG, status=status): - logger.info('Manager probe hit true') - man_signal = comm.recv(source=0, tag=STOP_TAG, status=status) - if man_signal == MAN_SIGNAL_FINISH: - self.manager_signal = 'finish' - elif man_signal == MAN_SIGNAL_KILL: - self.manager_signal = 'kill' - else: - logger.warning("Received unrecognized manager signal {} - " - "ignoring".format(man_signal)) - - def get_job(self, jobid): - """ Returns the job object for the supplied job ID """ - job = next((j for j in self.list_of_jobs if j.id == jobid), None) - if job is None: - logger.warning("Job {} not found in joblist".format(jobid)) - return job - - def set_workerID(self, workerid): - """Sets the worker ID for this job_controller""" - self.workerID = workerid - - def kill(self, job): - "Kill a job" - jassert(isinstance(job, Job), "Invalid job has been provided") - job.kill(self.wait_time) diff --git a/libensemble/sim_funcs/job_control_hworld.py b/libensemble/sim_funcs/job_control_hworld.py index 77ef903f1..d082d1dd8 100644 --- a/libensemble/sim_funcs/job_control_hworld.py +++ b/libensemble/sim_funcs/job_control_hworld.py @@ -1,4 +1,4 @@ -from libensemble.controller import JobController +from libensemble.controller import MPIJobController from libensemble.message_numbers import * import numpy as np @@ -64,7 +64,7 @@ def polling_loop(jobctl, job, timeout_sec=6.0, delay=1.0): def job_control_hworld(H, persis_info, sim_specs, _): """ Test of launching and polling job and exiting on job finish""" - jobctl = JobController.controller + jobctl = MPIJobController.controller cores = sim_specs['cores'] args_for_sim = 'sleep 3' diff --git a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py index a6564a2d9..9b7ec8dd5 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py @@ -45,7 +45,7 @@ def build_simfunc(): jobctrl = BalsamJobController(registry = registry) else: registry = Register() - jobctrl = JobController(registry = registry) + jobctrl = MPIJobController(registry = registry) registry.register_calc(full_path=sim_app, calc_type='sim') diff --git a/libensemble/tests/controller_tests/test_jobcontroller.py b/libensemble/tests/controller_tests/test_jobcontroller.py index 5e9da2e1a..b069fe404 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.py @@ -36,7 +36,7 @@ def build_simfunc(): jobctrl = BalsamJobController(registry = registry) else: registry = Register() - jobctrl = JobController(registry = registry) + jobctrl = MPIJobController(registry = registry) registry.register_calc(full_path=sim_app, calc_type='sim') diff --git a/libensemble/tests/controller_tests/test_jobcontroller_multi.py b/libensemble/tests/controller_tests/test_jobcontroller_multi.py index 4bb6c1eb3..40a806390 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller_multi.py +++ b/libensemble/tests/controller_tests/test_jobcontroller_multi.py @@ -18,7 +18,7 @@ def build_simfunc(): #--------------- Calling script --------------------------------------------------------------- from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController +from libensemble.controller import JobController, MPIJobController from libensemble.baslam_controller import BalsamJobController 
#sim_app = 'simdir/my_simjob.x' @@ -39,7 +39,7 @@ def build_simfunc(): jobctrl = BalsamJobController(registry = registry) else: registry = Register() - jobctrl = JobController(registry = registry) + jobctrl = MPIJobController(registry = registry) registry.register_calc(full_path=sim_app, calc_type='sim') diff --git a/libensemble/tests/regression_tests/test_comms.py b/libensemble/tests/regression_tests/test_comms.py index fa277a73f..2c881c4d8 100644 --- a/libensemble/tests/regression_tests/test_comms.py +++ b/libensemble/tests/regression_tests/test_comms.py @@ -34,11 +34,11 @@ from libensemble.sim_funcs.comms_testing import float_x1000 from libensemble.gen_funcs.uniform_sampling import uniform_random_sample from libensemble.register import Register #Only being used to pass workerID -from libensemble.controller import JobController #Only being used to pass workerID +from libensemble.controller import JobController, MPIJobController #Only being used to pass workerID from libensemble.resources import Resources #Only to get number of workers registry = Register() -jobctrl = JobController(registry = registry, auto_resources = False) +jobctrl = MPIJobController(registry = registry, auto_resources = False) #registry.register_calc(full_path=sim_app, calc_type='sim') #Test with no app registered. num_workers = Resources.get_num_workers() diff --git a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py index 483b3d403..f58291acc 100644 --- a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py +++ b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py @@ -10,7 +10,7 @@ from libensemble.sim_funcs.job_control_hworld import job_control_hworld from libensemble.gen_funcs.uniform_sampling import uniform_random_sample from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController +from libensemble.controller import JobController, MPIJobController from libensemble.balsam_controller import BalsamJobController from libensemble.calc_info import CalcInfo from libensemble.resources import Resources @@ -40,7 +40,7 @@ def build_simfunc(): jobctrl = BalsamJobController(registry = registry, auto_resources = True) else: registry = Register() - jobctrl = JobController(registry = registry, auto_resources = True) + jobctrl = MPIJobController(registry = registry, auto_resources = True) registry.register_calc(full_path=sim_app, calc_type='sim') summary_file_name = short_name + '.libe_summary.txt' diff --git a/libensemble/tests/unit_tests/test_job_funcs.py b/libensemble/tests/unit_tests/test_job_funcs.py index 8c0c8d5f8..3b981b006 100644 --- a/libensemble/tests/unit_tests/test_job_funcs.py +++ b/libensemble/tests/unit_tests/test_job_funcs.py @@ -2,7 +2,7 @@ import shutil from libensemble.register import Register -from libensemble.controller import Job, JobController, JobControllerException +from libensemble.controller import Job, JobController, MPIJobController, JobControllerException def setup_module(module): print ("setup_module module:%s" % module.__name__) @@ -41,7 +41,7 @@ def teardown_module(module): def test_job_funcs(): dummyappname = os.getcwd() + '/myapp.x' registry = Register() - jobctrl = JobController(registry = registry, auto_resources = False) + jobctrl = MPIJobController(registry = registry, auto_resources = False) registry.register_calc(full_path=dummyappname, calc_type='gen', desc='A dummy calc') registry.register_calc(full_path=dummyappname, calc_type='sim', 
desc='A dummy calc') diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index 9db3fadeb..229b9fe22 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -7,7 +7,7 @@ import pytest import socket from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController +from libensemble.controller import JobController, MPIJobController from libensemble.balsam_controller import BalsamJobController USE_BALSAM = False @@ -76,7 +76,7 @@ def setup_job_controller(): jobctrl = BalsamJobController(registry = registry, auto_resources = False) else: registry = Register() - jobctrl = JobController(registry = registry, auto_resources = False) + jobctrl = MPIJobController(registry = registry, auto_resources = False) registry.register_calc(full_path=sim_app, calc_type='sim') @@ -90,7 +90,7 @@ def setup_job_controller_noreg(): jobctrl = BalsamJobController(auto_resources = False) else: registry = Register() - jobctrl = JobController(auto_resources = False) + jobctrl = MPIJobController(auto_resources = False) registry.register_calc(full_path=sim_app, calc_type='sim') @@ -104,7 +104,7 @@ def setup_job_controller_noapp(): jobctrl = BalsamJobController(registry = registry, auto_resources = False) else: registry = Register() - jobctrl = JobController(registry = registry, auto_resources = False) + jobctrl = MPIJobController(registry = registry, auto_resources = False) # ----------------------------------------------------------------------------- # The following would typically be in the user sim_func @@ -436,7 +436,7 @@ def test_create_jobcontroller_no_registry(): args_for_sim = 'sleep 0.1' #import pdb;pdb.set_trace() try: - jobctrl = JobController(auto_resources = False) + jobctrl = MPIJobController(auto_resources = False) except: assert 1 else: From fd843458c52dbc060cc2cdf6ac2f7bc98c9001db Mon Sep 17 00:00:00 2001 From: David Bindel Date: Sun, 9 Sep 2018 17:15:28 -0500 Subject: [PATCH 049/101] Common controller.default_app with error check; split _get_mpi_specs helper. --- libensemble/balsam_controller.py | 7 +-- libensemble/controller.py | 98 ++++++++++++++++++-------------- 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 46f28a1e5..62496494e 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -165,12 +165,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, """ import balsam.launcher.dag as dag - app = self.registry.default_app(calc_type) - jassert(calc_type in ['sim', 'gen'], - "Unrecognized calculation type", calc_type) - jassert(app, "Default {} app is not set".format(calc_type)) - - #-------- Up to here should be common - can go in a baseclass ------# + app = self.default_app(calc_type) #Need test somewhere for if no breakdown supplied.... 
#or only machinefile diff --git a/libensemble/controller.py b/libensemble/controller.py index 6913a689c..ebbd6f0bc 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -33,9 +33,12 @@ FAILED""".split() -class JobControllerException(Exception): pass +class JobControllerException(Exception): + "Raised for any exception in the JobController" + pass def jassert(test, *args): + "Version of assert that raises a JobControllerException" if not test: raise JobControllerException(*args) @@ -236,6 +239,14 @@ def __init__(self, registry=None): self.workerID = None JobController.controller = self + def default_app(self, calc_type): + "Get the default app for a given calc type." + app = self.registry.default_app(calc_type) + jassert(calc_type in ['sim', 'gen'], + "Unrecognized calculation type", calc_type) + jassert(app, "Default {} app is not set".format(calc_type)) + return app + def manager_poll(self): """ Polls for a manager signal @@ -337,6 +348,46 @@ def __init__(self, registry=None, auto_resources=True, self.mpi_command = mpi_commands[MPIResources.get_MPI_variant()] + def _get_mpi_specs(self, num_procs, num_nodes, ranks_per_node, + machinefile, hyperthreads): + "Form the mpi_specs dictionary." + hostlist = None + if machinefile is None and self.auto_resources: + + #kludging this for now - not nec machinefile if more than one node + #- try a hostlist + num_procs, num_nodes, ranks_per_node = \ + self.resources.get_resources( + num_procs=num_procs, + num_nodes=num_nodes, ranks_per_node=ranks_per_node, + hyperthreads=hyperthreads) + + if num_nodes > 1: + #hostlist + hostlist = self.resources.get_hostlist() + else: + #machinefile + machinefile = "machinefile_autogen" + if self.workerID is not None: + machinefile += "_for_worker_{}".format(self.workerID) + mfile_created, num_procs, num_nodes, ranks_per_node = \ + self.resources.create_machinefile( + machinefile, num_procs, num_nodes, + ranks_per_node, hyperthreads) + jassert(mfile_created, "Auto-creation of machinefile failed") + + else: + num_procs, num_nodes, ranks_per_node = \ + MPIResources.job_partition(num_procs, num_nodes, + ranks_per_node, machinefile) + + return {'num_procs': num_procs, + 'num_nodes': num_nodes, + 'ranks_per_node': ranks_per_node, + 'machinefile': machinefile, + 'hostlist': hostlist} + + def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, stdout=None, stderr=None, stage_inout=None, @@ -398,43 +449,7 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, then the available resources will be divided amongst workers. 
""" - app = self.registry.default_app(calc_type) - jassert(calc_type in ['sim', 'gen'], - "Unrecognized calculation type", calc_type) - jassert(app, "Default {} app is not set".format(calc_type)) - - - #-------- Up to here should be common - can go in a baseclass ------# - hostlist = None - if machinefile is None and self.auto_resources: - - #kludging this for now - not nec machinefile if more than one node - #- try a hostlist - num_procs, num_nodes, ranks_per_node = \ - self.resources.get_resources( - num_procs=num_procs, - num_nodes=num_nodes, ranks_per_node=ranks_per_node, - hyperthreads=hyperthreads) - - if num_nodes > 1: - #hostlist - hostlist = self.resources.get_hostlist() - else: - #machinefile - machinefile = "machinefile_autogen" - if self.workerID is not None: - machinefile += "_for_worker_{}".format(self.workerID) - mfile_created, num_procs, num_nodes, ranks_per_node = \ - self.resources.create_machinefile( - machinefile, num_procs, num_nodes, - ranks_per_node, hyperthreads) - jassert(mfile_created, "Auto-creation of machinefile failed") - - else: - num_procs, num_nodes, ranks_per_node = \ - MPIResources.job_partition(num_procs, num_nodes, - ranks_per_node, machinefile) - + app = self.default_app(calc_type) default_workdir = os.getcwd() job = Job(app, app_args, default_workdir, stdout, stderr, self.workerID) @@ -442,11 +457,8 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, logger.warning("stage_inout option ignored in this " "job_controller - runs in-place") - mpi_specs = {'num_procs': num_procs, - 'num_nodes': num_nodes, - 'ranks_per_node': ranks_per_node, - 'machinefile': machinefile, - 'hostlist': hostlist} + mpi_specs = self._get_mpi_specs(num_procs, num_nodes, ranks_per_node, + machinefile, hyperthreads) runline = launcher.form_command(self.mpi_command, mpi_specs) runline.append(job.app.full_path) if job.app_args is not None: From 9729cdbcd7fe59011a96509e71ca3cf3cccc0f22 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 09:50:07 -0500 Subject: [PATCH 050/101] Factored error checking at start of poll into a base class method. --- libensemble/balsam_controller.py | 14 +------------- libensemble/controller.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 62496494e..fc4639ed4 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -67,21 +67,9 @@ def calc_job_timing(self): def poll(self): """Polls and updates the status attributes of the supplied job""" - jassert(isinstance(self, BalsamJob), "Invalid job has been provided") - - # Check the jobs been launched (i.e. it has a process ID) - #Prob should be recoverable and return state - but currently fatal - jassert(self.process, - "Polled job has no process ID - check jobs been launched") - - # Do not poll if job already finished - if self.finished: - logger.warning("Polled job has already finished. Not re-polling. 
" - "Status is {}".format(self.state)) + if not self.check_poll(): return - #-------- Up to here should be common - can go in a baseclass ------# - # Get current state of jobs from Balsam database self.process.refresh_from_db() balsam_state = self.process.state diff --git a/libensemble/controller.py b/libensemble/controller.py index ebbd6f0bc..993a6fe16 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -136,9 +136,8 @@ def calc_job_timing(self): if self.total_time is None: self.total_time = self.runtime - def poll(self): - """Polls and updates the status attributes of the job""" - + def check_poll(self): + """Check whether polling this job makes sense.""" jassert(self.process is not None, "Polled job {} has no process ID - check jobs been launched". format(self.name)) @@ -146,9 +145,14 @@ def poll(self): logger.warning("Polled job {} has already finished. " "Not re-polling. Status is {}". format(self.name, self.state)) - return + return False + return True - #-------- Up to here should be common - can go in a baseclass ------# + + def poll(self): + """Polls and updates the status attributes of the job""" + if not self.check_poll(): + return # Poll the job poll = self.process.poll() From 87f9ba335ac0d8fab3415919a84549ef468dac16 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 10:51:12 -0500 Subject: [PATCH 051/101] Remove (never-used) default flags in Register. --- libensemble/register.py | 131 ++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 80 deletions(-) diff --git a/libensemble/register.py b/libensemble/register.py index 2dbc12145..461b668cc 100644 --- a/libensemble/register.py +++ b/libensemble/register.py @@ -7,38 +7,44 @@ from mpi4py import MPI logger = logging.getLogger(__name__) -#For debug messages in this module - uncomment (see libE.py to change root logging level) +#For debug messages in this module - uncomment +#(see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) -class RegistrationException(Exception): pass +class RegistrationException(Exception): + "Raised for any exception in the Register" + pass +def rassert(check, *args): + "Version of assert that raises a RegistrationException" + if not check: + raise RegistrationException(*args) -class Application: - '''An application is an executable user-program (e.g. Implementing a sim/gen function).''' +class Application: + """An application is an executable user-program + (e.g. Implementing a sim/gen function).""" - def __init__(self, full_path, calc_type='sim', desc=None, default=True): - '''Instantiate a new Application instance.''' + def __init__(self, full_path, calc_type='sim', desc=None): + """Instantiate a new Application instance.""" self.full_path = full_path self.calc_type = calc_type - self.default = default self.calc_dir, self.exe = os.path.split(full_path) - #Dont change:Why? - cos will use this name to delete jobs in database - see del_apps(), del_jobs() + # Use this name to delete jobs in database - see del_apps(), del_jobs() self.name = self.exe + '.' + self.calc_type + 'func' self.desc = desc or (self.exe + ' ' + self.calc_type + ' function') #May merge this into job_controller class Register(): - - '''Registers and stores user applications + """Registers and stores user applications Attributes ---------- default_registry : Obj: Register or inherited class. A class attribute holding the default registry. 
- ''' + """ default_registry = None @@ -56,25 +62,13 @@ def default_app(self, calc_type): """Return the default calc_type app.""" return self._default_apps.get(calc_type) - def __init__(self, default=True): - '''Instantiate a new Register instance - - Parameters - ---------- - - default: Boolean, optional - Make this the default_registry (Default is True) - - - Note: Currently, only a default registry is supported. - - ''' + def __init__(self): + """Instantiate a new Register instance.""" self._default_apps = {'sim' : None, 'gen': None} - if default: - Register.default_registry = self + Register.default_registry = self - def register_calc(self, full_path, calc_type='sim', desc=None, default=True): - '''Registers a user application to libEnsemble + def register_calc(self, full_path, calc_type='sim', desc=None): + """Registers a user application to libEnsemble Parameters ---------- @@ -83,33 +77,27 @@ def register_calc(self, full_path, calc_type='sim', desc=None, default=True): The full path of the user application to be registered. calc_type: String - Calculation type: Is this application part of a 'sim' or 'gen' function. + Calculation type: Is this application part of a 'sim' + or 'gen' function. desc: String, optional Description of this application. - default: Boolean, optional - Register to the default_registry (Default is True). - - - ''' - if not default: - return # Always default currently - if calc_type not in self._default_apps: - raise RegistrationException("Unrecognized calculation type", calc_type) - if self._default_apps[calc_type] is not None: - raise RegistrationException("Default {} app already set".format(calc_type)) - self._default_apps[calc_type] = Application(full_path, calc_type, desc, default) + """ + rassert(calc_type in self._default_apps, + "Unrecognized calculation type", calc_type) + rassert(self._default_apps[calc_type] is None, + "Default {} app already set".format(calc_type)) + self._default_apps[calc_type] = Application(full_path, calc_type, desc) class BalsamRegister(Register): - '''Registers and stores user applications in libEnsemble and Balsam''' + """Registers and stores user applications in libEnsemble and Balsam""" @staticmethod def del_apps(): - """ Deletes all Balsam apps whose names contains substring .simfunc or .genfunc""" - import balsam.launcher.dag + """Deletes all Balsam apps whose names contains .simfunc or .genfunc""" from balsam.service import models AppDef = models.ApplicationDefinition @@ -123,8 +111,7 @@ def del_apps(): @staticmethod def del_jobs(): - """ Deletes all Balsam jobs whose names contains substring .simfunc or .genfunc""" - import balsam.launcher.dag + """Deletes all Balsam jobs whose names contains .simfunc or .genfunc""" from balsam.service import models Job = models.BalsamJob @@ -135,7 +122,8 @@ def del_jobs(): logger.debug("Deleting job {}".format(del_job.name)) deletion_objs.delete() - ##May be able to use union function - to combine - see queryset help. Eg (not tested) + ##May be able to use union function - to combine - see queryset help. 
+ ##Eg (not tested) #del_simfuncs = Job.objects.filter(name__contains='.simfunc') #del_genfuncs = Job.objects.filter(name__contains='.genfunc') #deletion_objs = deletion_objs.union() @@ -143,7 +131,6 @@ def del_jobs(): @staticmethod def add_app(name, exepath, desc): """ Add application to Balsam database """ - import balsam.launcher.dag from balsam.service import models AppDef = models.ApplicationDefinition app = AppDef() @@ -155,34 +142,21 @@ def add_app(name, exepath, desc): app.save() logger.debug("Added App {}".format(app.name)) - def __init__(self, default=True): - '''Instantiate a new BalsamRegister instance - - Parameters - ---------- - - default: Boolean, optional - Make this the default_registry (Default is True) - - - Note: Currently, only a default registry is supported. - - ''' - - super().__init__(default) + def __init__(self): + """Instantiate a new BalsamRegister instance""" + super().__init__() #Check for empty database if poss #And/or compare with whats in database and only empty if I need to - - #Currently not deleting as will delete the top level job - ie. the one running. - + #Currently not deleting as will delete the top level job - + # ie. the one running. #Will put MPI_MODE in a settings module... if MPI.COMM_WORLD.Get_rank() == 0: BalsamRegister.del_apps() BalsamRegister.del_jobs() - def register_calc(self, full_path, calc_type='sim', desc=None, default=True): - '''Registers a user applications to libEnsemble and Balsam + def register_calc(self, full_path, calc_type='sim', desc=None): + """Registers a user applications to libEnsemble and Balsam Parameters ---------- @@ -191,25 +165,22 @@ def register_calc(self, full_path, calc_type='sim', desc=None, default=True): The full path of the user application to be registered. calc_type: String - Calculation type: Is this application part of a 'sim' or 'gen' function. + Calculation type: Is this application part of a 'sim' + or 'gen' function. desc: String, optional Description of this application. - default: Boolean, optional - Register to the default_registry (Default is True). - + """ + # OK to use Python 3 syntax (Balsam requires 3.6+) + super().register_calc(full_path, calc_type, desc) - ''' - super().register_calc(full_path, calc_type, desc, default) - #Req python 3 to exclude args - but as Balsam requires 3.6+ I may do - or is it only __init__() + rassert(calc_type in self._default_apps, + "Unrecognized calculation type", calc_type) #Get from one place - so always matches - if calc_type in self._default_apps: - calc_name = self._default_apps[calc_type].name - desc = self._default_apps[calc_type].desc - else: - raise RegistrationException("Unrecognized calculation type", calc_type) + calc_name = self._default_apps[calc_type].name + desc = self._default_apps[calc_type].desc if MPI.COMM_WORLD.Get_rank() == 0: self.add_app(calc_name, full_path, desc) From e11ea9b1a198d08ab89a63eeeb13b4c0b9c8119f Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 11:32:15 -0500 Subject: [PATCH 052/101] Split out MPIJobController into a separate module (prep for Register merge). 
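
MPIJobController now lives in libensemble/mpi_controller.py, with the generic
JobController base left in libensemble/controller.py. A rough usage sketch of
the new layout, mirroring the updated controller tests (sim_app, the core
count and app_args below are placeholders; registration still goes through
Register until the later controller/registry merge):

    from libensemble.register import Register
    from libensemble.mpi_controller import MPIJobController

    # Register the user application, then create the MPI job controller
    registry = Register()
    registry.register_calc(full_path=sim_app, calc_type='sim')
    jobctrl = MPIJobController(registry=registry)

    # Launch a 'sim' job; poll/kill via the returned job object as before
    job = jobctrl.launch(calc_type='sim', num_procs=4, app_args='sleep 0.1')
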
--- libensemble/balsam_controller.py | 13 +- libensemble/controller.py | 199 +---------------- libensemble/mpi_controller.py | 206 ++++++++++++++++++ libensemble/sim_funcs/job_control_hworld.py | 2 +- .../test_jobcontroller.manager_poll.py | 1 + .../controller_tests/test_jobcontroller.py | 1 + .../test_jobcontroller_multi.py | 3 +- .../tests/regression_tests/test_comms.py | 2 +- .../test_jobcontroller_hworld.py | 3 +- .../tests/unit_tests/test_job_funcs.py | 3 +- .../tests/unit_tests/test_jobcontroller.py | 3 +- 11 files changed, 226 insertions(+), 210 deletions(-) create mode 100644 libensemble/mpi_controller.py diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index fc4639ed4..bec733c78 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -1,13 +1,5 @@ -#!/usr/bin/env python - """ -Module to launch and control running jobs. - -Contains job_controller, job, and inherited classes. A job_controller can -create and manage multiple jobs. The worker or user-side code can issue -and manage jobs using the launch, poll and kill functions. Job attributes -are queried to determine status. Functions are also provided to access -and interrogate files in the job's working directory. +Module to launch and control running jobs with Balsam. """ @@ -17,7 +9,8 @@ from libensemble.mpi_resources import MPIResources from libensemble.controller import \ - Job, MPIJobController, JobControllerException, jassert, STATES + Job, JobControllerException, jassert, STATES +from libensemble.mpi_controller import MPIJobController logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') #For debug messages in this module - uncomment diff --git a/libensemble/controller.py b/libensemble/controller.py index 993a6fe16..eff7b50d5 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -1,11 +1,11 @@ """ Module to launch and control running jobs. -Contains job_controller, job, and inherited classes. A job_controller can -create and manage multiple jobs. The worker or user-side code can issue -and manage jobs using the launch, poll and kill functions. Job attributes -are queried to determine status. Functions are also provided to access -and interrogate files in the job's working directory. +Contains job_controller and job. Inherited classes include MPI and Balsam +variants. A job_controller can create and manage multiple jobs. The worker or +user-side code can issue and manage jobs using the launch, poll and kill +functions. Job attributes are queried to determine status. Functions are also +provided to access and interrogate files in the job's working directory. """ @@ -293,192 +293,3 @@ def kill(self, job): "Kill a job" jassert(isinstance(job, Job), "Invalid job has been provided") job.kill(self.wait_time) - - -class MPIJobController(JobController): - """The MPI job_controller can create, poll and kill runnable MPI jobs - """ - - def __init__(self, registry=None, auto_resources=True, - nodelist_env_slurm=None, nodelist_env_cobalt=None): - """Instantiate a new JobController instance. - - A new JobController object is created with an application - registry and configuration attributes. A registry object must - have been created. - - This is typically created in the user calling script. If - auto_resources is True, an evaluation of system resources is - performance during this call. 
- - Parameters - ---------- - registry: obj: Registry, optional - A registry containing the applications to use in this - job_controller (Default: Use Register.default_registry). - - auto_resources: Boolean, optional - Auto-detect available processor resources and assign to jobs - if not explicitly provided on launch. - - nodelist_env_slurm: String, optional - The environment variable giving a node list in Slurm format - (Default: Uses SLURM_NODELIST). Note: This is only queried if - a worker_list file is not provided and auto_resources=True. - - nodelist_env_cobalt: String, optional - The environment variable giving a node list in Cobalt format - (Default: Uses COBALT_PARTNAME) Note: This is only queried - if a worker_list file is not provided and - auto_resources=True. - """ - - JobController.__init__(self, registry) - self.auto_resources = auto_resources - if self.auto_resources: - self.resources = \ - MPIResources(top_level_dir=self.top_level_dir, - nodelist_env_slurm=nodelist_env_slurm, - nodelist_env_cobalt=nodelist_env_cobalt) - - mpi_commands = { - 'mpich': ['mpirun', '--env {env}', '-machinefile {machinefile}', - '-hosts {hostlist}', '-np {num_procs}', - '--ppn {ranks_per_node}'], - 'openmpi': ['mpirun', '-x {env}', '-machinefile {machinefile}', - '-host {hostlist}', '-np {num_procs}', - '-npernode {ranks_per_node}'], - } - self.mpi_command = mpi_commands[MPIResources.get_MPI_variant()] - - - def _get_mpi_specs(self, num_procs, num_nodes, ranks_per_node, - machinefile, hyperthreads): - "Form the mpi_specs dictionary." - hostlist = None - if machinefile is None and self.auto_resources: - - #kludging this for now - not nec machinefile if more than one node - #- try a hostlist - num_procs, num_nodes, ranks_per_node = \ - self.resources.get_resources( - num_procs=num_procs, - num_nodes=num_nodes, ranks_per_node=ranks_per_node, - hyperthreads=hyperthreads) - - if num_nodes > 1: - #hostlist - hostlist = self.resources.get_hostlist() - else: - #machinefile - machinefile = "machinefile_autogen" - if self.workerID is not None: - machinefile += "_for_worker_{}".format(self.workerID) - mfile_created, num_procs, num_nodes, ranks_per_node = \ - self.resources.create_machinefile( - machinefile, num_procs, num_nodes, - ranks_per_node, hyperthreads) - jassert(mfile_created, "Auto-creation of machinefile failed") - - else: - num_procs, num_nodes, ranks_per_node = \ - MPIResources.job_partition(num_procs, num_nodes, - ranks_per_node, machinefile) - - return {'num_procs': num_procs, - 'num_nodes': num_nodes, - 'ranks_per_node': ranks_per_node, - 'machinefile': machinefile, - 'hostlist': hostlist} - - - def launch(self, calc_type, num_procs=None, num_nodes=None, - ranks_per_node=None, machinefile=None, app_args=None, - stdout=None, stderr=None, stage_inout=None, - hyperthreads=False, test=False): - """Creates a new job, and either launches or schedules launch. - - The created job object is returned. - - Parameters - ---------- - - calc_type: String - The calculation type: 'sim' or 'gen' - - num_procs: int, optional - The total number of MPI tasks on which to launch the job. - - num_nodes: int, optional - The number of nodes on which to launch the job. - - ranks_per_node: int, optional - The ranks per node for this job. - - machinefile: string, optional - Name of a machinefile for this job to use. - - app_args: string, optional - A string of the application arguments to be added to job - launch command line. - - stdout: string, optional - A standard output filename. 
- - stderr: string, optional - A standard error filename. - - stage_inout: string, optional - A directory to copy files from. Default will take from - current directory. - - hyperthreads: boolean, optional - Whether to launch MPI tasks to hyperthreads - - test: boolean, optional - Whether this is a test - No job will be launched. Instead - runline is printed to logger (At INFO level). - - - Returns - ------- - - job: obj: Job - The lauched job object. - - - Note that if some combination of num_procs, num_nodes and - ranks_per_node are provided, these will be honored if - possible. If resource detection is on and these are omitted, - then the available resources will be divided amongst workers. - """ - - app = self.default_app(calc_type) - default_workdir = os.getcwd() - job = Job(app, app_args, default_workdir, stdout, stderr, self.workerID) - - if stage_inout is not None: - logger.warning("stage_inout option ignored in this " - "job_controller - runs in-place") - - mpi_specs = self._get_mpi_specs(num_procs, num_nodes, ranks_per_node, - machinefile, hyperthreads) - runline = launcher.form_command(self.mpi_command, mpi_specs) - runline.append(job.app.full_path) - if job.app_args is not None: - runline.extend(job.app_args.split()) - - if test: - logger.info('Test selected: Not launching job') - logger.info('runline args are {}'.format(runline)) - else: - logger.debug("Launching job {}: {}". - format(job.name, " ".join(runline))) #One line - job.launch_time = time.time() - job.process = launcher.launch(runline, cwd='./', - stdout=open(job.stdout, 'w'), - stderr=open(job.stderr, 'w'), - start_new_session=True) - self.list_of_jobs.append(job) - - return job diff --git a/libensemble/mpi_controller.py b/libensemble/mpi_controller.py new file mode 100644 index 000000000..653bb8918 --- /dev/null +++ b/libensemble/mpi_controller.py @@ -0,0 +1,206 @@ +""" +Module to launch and control running MPI jobs. + +""" + +import os +import logging +import time + +import libensemble.launcher as launcher +from libensemble.mpi_resources import MPIResources +from libensemble.controller import JobController, Job, jassert + +logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') +#For debug messages in this module - uncomment +#(see libE.py to change root logging level) +#logger.setLevel(logging.DEBUG) + + +class MPIJobController(JobController): + """The MPI job_controller can create, poll and kill runnable MPI jobs + """ + + def __init__(self, registry=None, auto_resources=True, + nodelist_env_slurm=None, nodelist_env_cobalt=None): + """Instantiate a new JobController instance. + + A new JobController object is created with an application + registry and configuration attributes. A registry object must + have been created. + + This is typically created in the user calling script. If + auto_resources is True, an evaluation of system resources is + performance during this call. + + Parameters + ---------- + registry: obj: Registry, optional + A registry containing the applications to use in this + job_controller (Default: Use Register.default_registry). + + auto_resources: Boolean, optional + Auto-detect available processor resources and assign to jobs + if not explicitly provided on launch. + + nodelist_env_slurm: String, optional + The environment variable giving a node list in Slurm format + (Default: Uses SLURM_NODELIST). Note: This is only queried if + a worker_list file is not provided and auto_resources=True. 
+ + nodelist_env_cobalt: String, optional + The environment variable giving a node list in Cobalt format + (Default: Uses COBALT_PARTNAME) Note: This is only queried + if a worker_list file is not provided and + auto_resources=True. + """ + + JobController.__init__(self, registry) + self.auto_resources = auto_resources + if self.auto_resources: + self.resources = \ + MPIResources(top_level_dir=self.top_level_dir, + nodelist_env_slurm=nodelist_env_slurm, + nodelist_env_cobalt=nodelist_env_cobalt) + + mpi_commands = { + 'mpich': ['mpirun', '--env {env}', '-machinefile {machinefile}', + '-hosts {hostlist}', '-np {num_procs}', + '--ppn {ranks_per_node}'], + 'openmpi': ['mpirun', '-x {env}', '-machinefile {machinefile}', + '-host {hostlist}', '-np {num_procs}', + '-npernode {ranks_per_node}'], + } + self.mpi_command = mpi_commands[MPIResources.get_MPI_variant()] + + + def _get_mpi_specs(self, num_procs, num_nodes, ranks_per_node, + machinefile, hyperthreads): + "Form the mpi_specs dictionary." + hostlist = None + if machinefile is None and self.auto_resources: + + #kludging this for now - not nec machinefile if more than one node + #- try a hostlist + num_procs, num_nodes, ranks_per_node = \ + self.resources.get_resources( + num_procs=num_procs, + num_nodes=num_nodes, ranks_per_node=ranks_per_node, + hyperthreads=hyperthreads) + + if num_nodes > 1: + #hostlist + hostlist = self.resources.get_hostlist() + else: + #machinefile + machinefile = "machinefile_autogen" + if self.workerID is not None: + machinefile += "_for_worker_{}".format(self.workerID) + mfile_created, num_procs, num_nodes, ranks_per_node = \ + self.resources.create_machinefile( + machinefile, num_procs, num_nodes, + ranks_per_node, hyperthreads) + jassert(mfile_created, "Auto-creation of machinefile failed") + + else: + num_procs, num_nodes, ranks_per_node = \ + MPIResources.job_partition(num_procs, num_nodes, + ranks_per_node, machinefile) + + return {'num_procs': num_procs, + 'num_nodes': num_nodes, + 'ranks_per_node': ranks_per_node, + 'machinefile': machinefile, + 'hostlist': hostlist} + + + def launch(self, calc_type, num_procs=None, num_nodes=None, + ranks_per_node=None, machinefile=None, app_args=None, + stdout=None, stderr=None, stage_inout=None, + hyperthreads=False, test=False): + """Creates a new job, and either launches or schedules launch. + + The created job object is returned. + + Parameters + ---------- + + calc_type: String + The calculation type: 'sim' or 'gen' + + num_procs: int, optional + The total number of MPI tasks on which to launch the job. + + num_nodes: int, optional + The number of nodes on which to launch the job. + + ranks_per_node: int, optional + The ranks per node for this job. + + machinefile: string, optional + Name of a machinefile for this job to use. + + app_args: string, optional + A string of the application arguments to be added to job + launch command line. + + stdout: string, optional + A standard output filename. + + stderr: string, optional + A standard error filename. + + stage_inout: string, optional + A directory to copy files from. Default will take from + current directory. + + hyperthreads: boolean, optional + Whether to launch MPI tasks to hyperthreads + + test: boolean, optional + Whether this is a test - No job will be launched. Instead + runline is printed to logger (At INFO level). + + + Returns + ------- + + job: obj: Job + The lauched job object. 
+ + + Note that if some combination of num_procs, num_nodes and + ranks_per_node are provided, these will be honored if + possible. If resource detection is on and these are omitted, + then the available resources will be divided amongst workers. + """ + + app = self.default_app(calc_type) + default_workdir = os.getcwd() + job = Job(app, app_args, default_workdir, stdout, stderr, self.workerID) + + if stage_inout is not None: + logger.warning("stage_inout option ignored in this " + "job_controller - runs in-place") + + mpi_specs = self._get_mpi_specs(num_procs, num_nodes, ranks_per_node, + machinefile, hyperthreads) + runline = launcher.form_command(self.mpi_command, mpi_specs) + runline.append(job.app.full_path) + if job.app_args is not None: + runline.extend(job.app_args.split()) + + if test: + logger.info('Test selected: Not launching job') + logger.info('runline args are {}'.format(runline)) + else: + logger.debug("Launching job {}: {}". + format(job.name, " ".join(runline))) #One line + job.launch_time = time.time() + job.process = launcher.launch(runline, cwd='./', + stdout=open(job.stdout, 'w'), + stderr=open(job.stderr, 'w'), + start_new_session=True) + self.list_of_jobs.append(job) + + return job diff --git a/libensemble/sim_funcs/job_control_hworld.py b/libensemble/sim_funcs/job_control_hworld.py index d082d1dd8..3a785f33d 100644 --- a/libensemble/sim_funcs/job_control_hworld.py +++ b/libensemble/sim_funcs/job_control_hworld.py @@ -1,4 +1,4 @@ -from libensemble.controller import MPIJobController +from libensemble.mpi_controller import MPIJobController from libensemble.message_numbers import * import numpy as np diff --git a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py index 9b7ec8dd5..2d57d6986 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py @@ -25,6 +25,7 @@ def build_simfunc(): from libensemble.register import * from libensemble.controller import * +from libensemble.mpi_controller import * from libensemble.balsam_controller import * #sim_app = 'simdir/my_simjob.x' diff --git a/libensemble/tests/controller_tests/test_jobcontroller.py b/libensemble/tests/controller_tests/test_jobcontroller.py index b069fe404..374eac8e2 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.py @@ -16,6 +16,7 @@ def build_simfunc(): from libensemble.register import * from libensemble.controller import * +from libensemble.mpi_controller import * from libensemble.balsam_controller import * #sim_app = 'simdir/my_simjob.x' diff --git a/libensemble/tests/controller_tests/test_jobcontroller_multi.py b/libensemble/tests/controller_tests/test_jobcontroller_multi.py index 40a806390..d9922079a 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller_multi.py +++ b/libensemble/tests/controller_tests/test_jobcontroller_multi.py @@ -18,7 +18,8 @@ def build_simfunc(): #--------------- Calling script --------------------------------------------------------------- from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController, MPIJobController +from libensemble.controller import JobController +from libensemble.mpi_controller import MPIJobController from libensemble.baslam_controller import BalsamJobController #sim_app = 'simdir/my_simjob.x' diff --git 
a/libensemble/tests/regression_tests/test_comms.py b/libensemble/tests/regression_tests/test_comms.py index 2c881c4d8..299cd8b90 100644 --- a/libensemble/tests/regression_tests/test_comms.py +++ b/libensemble/tests/regression_tests/test_comms.py @@ -34,7 +34,7 @@ from libensemble.sim_funcs.comms_testing import float_x1000 from libensemble.gen_funcs.uniform_sampling import uniform_random_sample from libensemble.register import Register #Only being used to pass workerID -from libensemble.controller import JobController, MPIJobController #Only being used to pass workerID +from libensemble.mpi_controller import MPIJobController #Only being used to pass workerID from libensemble.resources import Resources #Only to get number of workers registry = Register() diff --git a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py index f58291acc..e3f0a51b2 100644 --- a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py +++ b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py @@ -10,7 +10,8 @@ from libensemble.sim_funcs.job_control_hworld import job_control_hworld from libensemble.gen_funcs.uniform_sampling import uniform_random_sample from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController, MPIJobController +from libensemble.controller import JobController +from libensemble.mpi_controller import MPIJobController from libensemble.balsam_controller import BalsamJobController from libensemble.calc_info import CalcInfo from libensemble.resources import Resources diff --git a/libensemble/tests/unit_tests/test_job_funcs.py b/libensemble/tests/unit_tests/test_job_funcs.py index 3b981b006..17f69d836 100644 --- a/libensemble/tests/unit_tests/test_job_funcs.py +++ b/libensemble/tests/unit_tests/test_job_funcs.py @@ -2,7 +2,8 @@ import shutil from libensemble.register import Register -from libensemble.controller import Job, JobController, MPIJobController, JobControllerException +from libensemble.controller import Job, JobController, JobControllerException +from libensemble.mpi_controller import MPIJobController def setup_module(module): print ("setup_module module:%s" % module.__name__) diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index 229b9fe22..a72005df3 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -7,7 +7,8 @@ import pytest import socket from libensemble.register import Register, BalsamRegister -from libensemble.controller import JobController, MPIJobController +from libensemble.controller import JobController +from libensemble.mpi_controller import MPIJobController from libensemble.balsam_controller import BalsamJobController USE_BALSAM = False From f60965fe1c3123dffdb401656df1c7e207d415fd Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 11:37:35 -0500 Subject: [PATCH 053/101] Base controller back to using Resources (vs MPIResources) for name --- libensemble/controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libensemble/controller.py b/libensemble/controller.py index eff7b50d5..ad64b723e 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -16,9 +16,9 @@ import libensemble.launcher as launcher from libensemble.register import Register -from libensemble.mpi_resources import MPIResources +from libensemble.resources import Resources 
-logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') +logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') #For debug messages in this module - uncomment #(see libE.py to change root logging level) #logger.setLevel(logging.DEBUG) From 9142db8417632bc9debab341d5c56da730b7fe0e Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 11:55:06 -0500 Subject: [PATCH 054/101] Redundant assertion in Balsam register_calc (already handled by base class call) --- libensemble/register.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libensemble/register.py b/libensemble/register.py index 461b668cc..3f64453d4 100644 --- a/libensemble/register.py +++ b/libensemble/register.py @@ -154,7 +154,6 @@ def __init__(self): BalsamRegister.del_apps() BalsamRegister.del_jobs() - def register_calc(self, full_path, calc_type='sim', desc=None): """Registers a user applications to libEnsemble and Balsam @@ -175,9 +174,6 @@ def register_calc(self, full_path, calc_type='sim', desc=None): # OK to use Python 3 syntax (Balsam requires 3.6+) super().register_calc(full_path, calc_type, desc) - rassert(calc_type in self._default_apps, - "Unrecognized calculation type", calc_type) - #Get from one place - so always matches calc_name = self._default_apps[calc_type].name desc = self._default_apps[calc_type].desc From 56075e538b24ae85d544dae3102e30b26e2cf7ab Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 12:39:01 -0500 Subject: [PATCH 055/101] Merged registry functions into job controllers. Changed tests to only import balsam_controller on USE_BALSAM. (hence, imports for Balsam can now go at the top of balsam_controller). --- libensemble/balsam_controller.py | 87 ++++++++- libensemble/controller.py | 68 +++++-- libensemble/mpi_controller.py | 8 +- libensemble/register.py | 182 ------------------ .../test_jobcontroller.manager_poll.py | 13 +- .../controller_tests/test_jobcontroller.py | 13 +- .../test_jobcontroller_multi.py | 13 +- .../tests/regression_tests/test_comms.py | 6 +- .../test_jobcontroller_hworld.py | 13 +- .../tests/unit_tests/test_job_funcs.py | 23 +-- .../tests/unit_tests/test_jobcontroller.py | 63 ++---- 11 files changed, 173 insertions(+), 316 deletions(-) delete mode 100644 libensemble/register.py diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index bec733c78..5649abb65 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -6,12 +6,16 @@ import os import logging import time +from mpi4py import MPI from libensemble.mpi_resources import MPIResources from libensemble.controller import \ Job, JobControllerException, jassert, STATES from libensemble.mpi_controller import MPIJobController +import balsam.launcher.dag as dag +from balsam.service import models + logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') #For debug messages in this module - uncomment #(see libE.py to change root logging level) @@ -67,10 +71,6 @@ def poll(self): self.process.refresh_from_db() balsam_state = self.process.state - #Might need this before get models - test - import balsam.launcher.dag as dag - from balsam.service import models - if balsam_state in models.END_STATES: self.finished = True self.calc_job_timing() @@ -106,7 +106,6 @@ def poll(self): def kill(self, wait_time=None): """ Kills or cancels the supplied job """ - import balsam.launcher.dag as dag dag.kill(self.process) #Could have Wait here and check with Balsam its killed - @@ -123,17 +122,89 @@ class 
BalsamJobController(MPIJobController): .. note:: Job kills are not configurable in the Balsam job_controller. """ - def __init__(self, registry=None, auto_resources=True, + def __init__(self, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new BalsamJobController instance. A new BalsamJobController object is created with an application registry and configuration attributes """ - super().__init__(registry, auto_resources, + super().__init__(auto_resources, nodelist_env_slurm, nodelist_env_cobalt) self.mpi_launcher = None + if MPI.COMM_WORLD.Get_rank() == 0: + BalsamJobController.del_apps() + BalsamJobController.del_jobs() + + @staticmethod + def del_apps(): + """Deletes all Balsam apps whose names contains .simfunc or .genfunc""" + AppDef = models.ApplicationDefinition + + #Some error handling on deletes.... is it internal + for app_type in ['.simfunc', '.genfunc']: + deletion_objs = AppDef.objects.filter(name__contains=app_type) + if deletion_objs: + for del_app in deletion_objs.iterator(): + logger.debug("Deleting app {}".format(del_app.name)) + deletion_objs.delete() + + @staticmethod + def del_jobs(): + """Deletes all Balsam jobs whose names contains .simfunc or .genfunc""" + for app_type in ['.simfunc', '.genfunc']: + deletion_objs = models.BalsamJob.objects.filter( + name__contains=app_type) + if deletion_objs: + for del_job in deletion_objs.iterator(): + logger.debug("Deleting job {}".format(del_job.name)) + deletion_objs.delete() + + ##May be able to use union function - to combine - see queryset help. + ##Eg (not tested) + #del_simfuncs = Job.objects.filter(name__contains='.simfunc') + #del_genfuncs = Job.objects.filter(name__contains='.genfunc') + #deletion_objs = deletion_objs.union() + + @staticmethod + def add_app(name, exepath, desc): + """ Add application to Balsam database """ + AppDef = models.ApplicationDefinition + app = AppDef() + app.name = name + app.executable = exepath + app.description = desc + #app.default_preprocess = '' # optional + #app.default_postprocess = '' # optional + app.save() + logger.debug("Added App {}".format(app.name)) + + def register_calc(self, full_path, calc_type='sim', desc=None): + """Registers a user applications to libEnsemble and Balsam + + Parameters + ---------- + + full_path: String + The full path of the user application to be registered. + + calc_type: String + Calculation type: Is this application part of a 'sim' + or 'gen' function. + + desc: String, optional + Description of this application. + + """ + # OK to use Python 3 syntax (Balsam requires 3.6+) + super().register_calc(full_path, calc_type, desc) + + #Get from one place - so always matches + calc_name = self.default_apps[calc_type].name + desc = self.default_apps[calc_type].desc + if MPI.COMM_WORLD.Get_rank() == 0: + self.add_app(calc_name, full_path, desc) def launch(self, calc_type, num_procs=None, num_nodes=None, ranks_per_node=None, machinefile=None, app_args=None, @@ -144,8 +215,6 @@ def launch(self, calc_type, num_procs=None, num_nodes=None, The created job object is returned. """ - import balsam.launcher.dag as dag - app = self.default_app(calc_type) #Need test somewhere for if no breakdown supplied.... 
diff --git a/libensemble/controller.py b/libensemble/controller.py index ad64b723e..ad9a30a1a 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -15,7 +15,6 @@ import time import libensemble.launcher as launcher -from libensemble.register import Register from libensemble.resources import Resources logger = logging.getLogger(__name__ + '(' + Resources.get_my_name() + ')') @@ -43,6 +42,21 @@ def jassert(test, *args): raise JobControllerException(*args) +class Application: + """An application is an executable user-program + (e.g. Implementing a sim/gen function).""" + + def __init__(self, full_path, calc_type='sim', desc=None): + """Instantiate a new Application instance.""" + self.full_path = full_path + self.calc_type = calc_type + self.calc_dir, self.exe = os.path.split(full_path) + + # Use this name to delete jobs in database - see del_apps(), del_jobs() + self.name = self.exe + '.' + self.calc_type + 'func' + self.desc = desc or (self.exe + ' ' + self.calc_type + ' function') + + class Job: """ Manage the creation, configuration and status of a launchable job. @@ -205,7 +219,6 @@ class JobController: **Object Attributes:** - :ivar Register registry: The registry associated with this job_controller :ivar int wait_time: Timeout period for hard kill :ivar list list_of_jobs: A list of jobs created in this job controller :ivar int workerID: The workerID associated with this job controller @@ -214,43 +227,66 @@ class JobController: controller = None - def __init__(self, registry=None): + def __init__(self): """Instantiate a new JobController instance. A new JobController object is created with an application - registry and configuration attributes. A registry object must - have been created. + registry and configuration attributes. This is typically created in the user calling script. If auto_resources is True, an evaluation of system resources is performance during this call. - - Parameters - ---------- - registry: obj: Registry, optional - A registry containing the applications to use in this - job_controller (Default: Use Register.default_registry). """ - - self.registry = registry or Register.default_registry - jassert(self.registry is not None, "Cannot find default registry") - self.top_level_dir = os.getcwd() self.manager_signal = 'none' + self.default_apps = {'sim' : None, 'gen': None} self.wait_time = 60 self.list_of_jobs = [] self.workerID = None JobController.controller = self + @property + def sim_default_app(self): + """Return the default simulation app.""" + return self.default_apps['sim'] + + @property + def gen_default_app(self): + """Return the default generator app.""" + return self.default_apps['gen'] + def default_app(self, calc_type): "Get the default app for a given calc type." - app = self.registry.default_app(calc_type) + app = self.default_apps.get(calc_type) jassert(calc_type in ['sim', 'gen'], "Unrecognized calculation type", calc_type) jassert(app, "Default {} app is not set".format(calc_type)) return app + def register_calc(self, full_path, calc_type='sim', desc=None): + """Registers a user application to libEnsemble + + Parameters + ---------- + + full_path: String + The full path of the user application to be registered. + + calc_type: String + Calculation type: Is this application part of a 'sim' + or 'gen' function. + + desc: String, optional + Description of this application. 
+ + """ + jassert(calc_type in self.default_apps, + "Unrecognized calculation type", calc_type) + jassert(self.default_apps[calc_type] is None, + "Default {} app already set".format(calc_type)) + self.default_apps[calc_type] = Application(full_path, calc_type, desc) + def manager_poll(self): """ Polls for a manager signal diff --git a/libensemble/mpi_controller.py b/libensemble/mpi_controller.py index 653bb8918..03b338521 100644 --- a/libensemble/mpi_controller.py +++ b/libensemble/mpi_controller.py @@ -21,7 +21,7 @@ class MPIJobController(JobController): """The MPI job_controller can create, poll and kill runnable MPI jobs """ - def __init__(self, registry=None, auto_resources=True, + def __init__(self, auto_resources=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new JobController instance. @@ -35,10 +35,6 @@ def __init__(self, registry=None, auto_resources=True, Parameters ---------- - registry: obj: Registry, optional - A registry containing the applications to use in this - job_controller (Default: Use Register.default_registry). - auto_resources: Boolean, optional Auto-detect available processor resources and assign to jobs if not explicitly provided on launch. @@ -55,7 +51,7 @@ def __init__(self, registry=None, auto_resources=True, auto_resources=True. """ - JobController.__init__(self, registry) + JobController.__init__(self) self.auto_resources = auto_resources if self.auto_resources: self.resources = \ diff --git a/libensemble/register.py b/libensemble/register.py deleted file mode 100644 index 3f64453d4..000000000 --- a/libensemble/register.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python - -"""Module to register applications to libEnsemble""" - -import os -import logging -from mpi4py import MPI - -logger = logging.getLogger(__name__) -#For debug messages in this module - uncomment -#(see libE.py to change root logging level) -#logger.setLevel(logging.DEBUG) - -class RegistrationException(Exception): - "Raised for any exception in the Register" - pass - -def rassert(check, *args): - "Version of assert that raises a RegistrationException" - if not check: - raise RegistrationException(*args) - - -class Application: - """An application is an executable user-program - (e.g. Implementing a sim/gen function).""" - - def __init__(self, full_path, calc_type='sim', desc=None): - """Instantiate a new Application instance.""" - self.full_path = full_path - self.calc_type = calc_type - self.calc_dir, self.exe = os.path.split(full_path) - - # Use this name to delete jobs in database - see del_apps(), del_jobs() - self.name = self.exe + '.' + self.calc_type + 'func' - self.desc = desc or (self.exe + ' ' + self.calc_type + ' function') - -#May merge this into job_controller -class Register(): - """Registers and stores user applications - - Attributes - ---------- - default_registry : Obj: Register or inherited class. - A class attribute holding the default registry. 
- - """ - - default_registry = None - - @property - def sim_default_app(self): - """Return the default simulation app.""" - return self._default_apps['sim'] - - @property - def gen_default_app(self): - """Return the default generator app.""" - return self._default_apps['gen'] - - def default_app(self, calc_type): - """Return the default calc_type app.""" - return self._default_apps.get(calc_type) - - def __init__(self): - """Instantiate a new Register instance.""" - self._default_apps = {'sim' : None, 'gen': None} - Register.default_registry = self - - def register_calc(self, full_path, calc_type='sim', desc=None): - """Registers a user application to libEnsemble - - Parameters - ---------- - - full_path: String - The full path of the user application to be registered. - - calc_type: String - Calculation type: Is this application part of a 'sim' - or 'gen' function. - - desc: String, optional - Description of this application. - - """ - rassert(calc_type in self._default_apps, - "Unrecognized calculation type", calc_type) - rassert(self._default_apps[calc_type] is None, - "Default {} app already set".format(calc_type)) - self._default_apps[calc_type] = Application(full_path, calc_type, desc) - - -class BalsamRegister(Register): - - """Registers and stores user applications in libEnsemble and Balsam""" - - @staticmethod - def del_apps(): - """Deletes all Balsam apps whose names contains .simfunc or .genfunc""" - from balsam.service import models - AppDef = models.ApplicationDefinition - - #Some error handling on deletes.... is it internal - for app_type in ['.simfunc', '.genfunc']: - deletion_objs = AppDef.objects.filter(name__contains=app_type) - if deletion_objs: - for del_app in deletion_objs.iterator(): - logger.debug("Deleting app {}".format(del_app.name)) - deletion_objs.delete() - - @staticmethod - def del_jobs(): - """Deletes all Balsam jobs whose names contains .simfunc or .genfunc""" - from balsam.service import models - Job = models.BalsamJob - - for app_type in ['.simfunc', '.genfunc']: - deletion_objs = Job.objects.filter(name__contains=app_type) - if deletion_objs: - for del_job in deletion_objs.iterator(): - logger.debug("Deleting job {}".format(del_job.name)) - deletion_objs.delete() - - ##May be able to use union function - to combine - see queryset help. - ##Eg (not tested) - #del_simfuncs = Job.objects.filter(name__contains='.simfunc') - #del_genfuncs = Job.objects.filter(name__contains='.genfunc') - #deletion_objs = deletion_objs.union() - - @staticmethod - def add_app(name, exepath, desc): - """ Add application to Balsam database """ - from balsam.service import models - AppDef = models.ApplicationDefinition - app = AppDef() - app.name = name - app.executable = exepath - app.description = desc - #app.default_preprocess = '' # optional - #app.default_postprocess = '' # optional - app.save() - logger.debug("Added App {}".format(app.name)) - - def __init__(self): - """Instantiate a new BalsamRegister instance""" - super().__init__() - #Check for empty database if poss - #And/or compare with whats in database and only empty if I need to - #Currently not deleting as will delete the top level job - - # ie. the one running. - #Will put MPI_MODE in a settings module... 
- if MPI.COMM_WORLD.Get_rank() == 0: - BalsamRegister.del_apps() - BalsamRegister.del_jobs() - - def register_calc(self, full_path, calc_type='sim', desc=None): - """Registers a user applications to libEnsemble and Balsam - - Parameters - ---------- - - full_path: String - The full path of the user application to be registered. - - calc_type: String - Calculation type: Is this application part of a 'sim' - or 'gen' function. - - desc: String, optional - Description of this application. - - """ - # OK to use Python 3 syntax (Balsam requires 3.6+) - super().register_calc(full_path, calc_type, desc) - - #Get from one place - so always matches - calc_name = self._default_apps[calc_type].name - desc = self._default_apps[calc_type].desc - - if MPI.COMM_WORLD.Get_rank() == 0: - self.add_app(calc_name, full_path, desc) diff --git a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py index 2d57d6986..f1213ef83 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py @@ -23,10 +23,7 @@ def build_simfunc(): #--------------- Calling script ------------------------------------------ -from libensemble.register import * from libensemble.controller import * -from libensemble.mpi_controller import * -from libensemble.balsam_controller import * #sim_app = 'simdir/my_simjob.x' #gen_app = 'gendir/my_genjob.x' @@ -42,13 +39,13 @@ def build_simfunc(): #Create and add exes to registry if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry) + from libensemble.balsam_controller import * + jobctrl = BalsamJobController() else: - registry = Register() - jobctrl = MPIJobController(registry = registry) + from libensemble.mpi_controller import * + jobctrl = MPIJobController() -registry.register_calc(full_path=sim_app, calc_type='sim') +jobctl.register_calc(full_path=sim_app, calc_type='sim') #Alternative to IF could be using eg. fstring to specify: e.g: #JOB_CONTROLLER = 'Balsam' diff --git a/libensemble/tests/controller_tests/test_jobcontroller.py b/libensemble/tests/controller_tests/test_jobcontroller.py index 374eac8e2..605e6da7e 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.py @@ -14,10 +14,7 @@ def build_simfunc(): #--------------- Calling script ------------------------------------------ -from libensemble.register import * from libensemble.controller import * -from libensemble.mpi_controller import * -from libensemble.balsam_controller import * #sim_app = 'simdir/my_simjob.x' #gen_app = 'gendir/my_genjob.x' @@ -33,13 +30,13 @@ def build_simfunc(): #Create and add exes to registry if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry) + from libensemble.balsam_controller import * + jobctrl = BalsamJobController() else: - registry = Register() - jobctrl = MPIJobController(registry = registry) + from libensemble.mpi_controller import * + jobctrl = MPIJobController() -registry.register_calc(full_path=sim_app, calc_type='sim') +jobctl.register_calc(full_path=sim_app, calc_type='sim') #Alternative to IF could be using eg. 
fstring to specify: e.g: #JOB_CONTROLLER = 'Balsam' diff --git a/libensemble/tests/controller_tests/test_jobcontroller_multi.py b/libensemble/tests/controller_tests/test_jobcontroller_multi.py index d9922079a..bec0d4f10 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller_multi.py +++ b/libensemble/tests/controller_tests/test_jobcontroller_multi.py @@ -17,10 +17,7 @@ def build_simfunc(): #--------------- Calling script --------------------------------------------------------------- -from libensemble.register import Register, BalsamRegister from libensemble.controller import JobController -from libensemble.mpi_controller import MPIJobController -from libensemble.baslam_controller import BalsamJobController #sim_app = 'simdir/my_simjob.x' #gen_app = 'gendir/my_genjob.x' @@ -36,13 +33,13 @@ def build_simfunc(): #Create and add exes to registry if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry) + from libensemble.baslam_controller import BalsamJobController + jobctrl = BalsamJobController() else: - registry = Register() - jobctrl = MPIJobController(registry = registry) + from libensemble.mpi_controller import MPIJobController + jobctrl = MPIJobController() -registry.register_calc(full_path=sim_app, calc_type='sim') +jobctrl.register_calc(full_path=sim_app, calc_type='sim') #Alternative to IF could be using eg. fstring to specify: e.g: #JOB_CONTROLLER = 'Balsam' diff --git a/libensemble/tests/regression_tests/test_comms.py b/libensemble/tests/regression_tests/test_comms.py index 299cd8b90..675a4c3c6 100644 --- a/libensemble/tests/regression_tests/test_comms.py +++ b/libensemble/tests/regression_tests/test_comms.py @@ -33,13 +33,11 @@ from libensemble.libE import libE from libensemble.sim_funcs.comms_testing import float_x1000 from libensemble.gen_funcs.uniform_sampling import uniform_random_sample -from libensemble.register import Register #Only being used to pass workerID from libensemble.mpi_controller import MPIJobController #Only being used to pass workerID from libensemble.resources import Resources #Only to get number of workers -registry = Register() -jobctrl = MPIJobController(registry = registry, auto_resources = False) -#registry.register_calc(full_path=sim_app, calc_type='sim') #Test with no app registered. +jobctrl = MPIJobController(auto_resources = False) +#jobctrl.register_calc(full_path=sim_app, calc_type='sim') #Test with no app registered. 
num_workers = Resources.get_num_workers() array_size = int(1e6) # Size of large array in sim_specs diff --git a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py index e3f0a51b2..16425b85f 100644 --- a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py +++ b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py @@ -9,10 +9,7 @@ from libensemble.libE import libE from libensemble.sim_funcs.job_control_hworld import job_control_hworld from libensemble.gen_funcs.uniform_sampling import uniform_random_sample -from libensemble.register import Register, BalsamRegister from libensemble.controller import JobController -from libensemble.mpi_controller import MPIJobController -from libensemble.balsam_controller import BalsamJobController from libensemble.calc_info import CalcInfo from libensemble.resources import Resources from libensemble.message_numbers import * @@ -37,12 +34,12 @@ def build_simfunc(): build_simfunc() if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry, auto_resources = True) + from libensemble.balsam_controller import BalsamJobController + jobctrl = BalsamJobController(auto_resources = True) else: - registry = Register() - jobctrl = MPIJobController(registry = registry, auto_resources = True) -registry.register_calc(full_path=sim_app, calc_type='sim') + from libensemble.mpi_controller import MPIJobController + jobctrl = MPIJobController(auto_resources = True) +jobctrl.register_calc(full_path=sim_app, calc_type='sim') summary_file_name = short_name + '.libe_summary.txt' CalcInfo.set_statfile_name(summary_file_name) diff --git a/libensemble/tests/unit_tests/test_job_funcs.py b/libensemble/tests/unit_tests/test_job_funcs.py index 17f69d836..e24adae2a 100644 --- a/libensemble/tests/unit_tests/test_job_funcs.py +++ b/libensemble/tests/unit_tests/test_job_funcs.py @@ -1,7 +1,6 @@ import os import shutil -from libensemble.register import Register from libensemble.controller import Job, JobController, JobControllerException from libensemble.mpi_controller import MPIJobController @@ -11,10 +10,6 @@ def setup_module(module): ctrl = JobController.controller del ctrl JobController.controller = None - if Register.default_registry: - defreg = Register.default_registry - del defreg - Register.default_registry = None def setup_function(function): print ("setup_function function:%s" % function.__name__) @@ -22,10 +17,6 @@ def setup_function(function): ctrl = JobController.controller del ctrl JobController.controller = None - if Register.default_registry: - defreg = Register.default_registry - del defreg - Register.default_registry = None def teardown_module(module): print ("teardown_module module:%s" % module.__name__) @@ -33,18 +24,12 @@ def teardown_module(module): ctrl = JobController.controller del ctrl JobController.controller = None - if Register.default_registry: - defreg = Register.default_registry - del defreg - Register.default_registry = None - def test_job_funcs(): dummyappname = os.getcwd() + '/myapp.x' - registry = Register() - jobctrl = MPIJobController(registry = registry, auto_resources = False) - registry.register_calc(full_path=dummyappname, calc_type='gen', desc='A dummy calc') - registry.register_calc(full_path=dummyappname, calc_type='sim', desc='A dummy calc') + jobctrl = MPIJobController(auto_resources = False) + jobctrl.register_calc(full_path=dummyappname, calc_type='gen', desc='A dummy calc') + 
jobctrl.register_calc(full_path=dummyappname, calc_type='sim', desc='A dummy calc') dirname = 'dir_jobc_tests' if os.path.exists(dirname): @@ -62,7 +47,7 @@ def test_job_funcs(): assert jc_triggered, "Failed to raise exception if create job with no app" #Now with no workdir specified - dummyapp = registry.gen_default_app + dummyapp = jobctrl.gen_default_app job1 = Job(app = dummyapp, stdout = 'stdout.txt') wd_exist = job1.workdir_exists() assert not wd_exist #, "No workdir specified, yet workdir_exists does not return False" diff --git a/libensemble/tests/unit_tests/test_jobcontroller.py b/libensemble/tests/unit_tests/test_jobcontroller.py index a72005df3..f94c2c266 100644 --- a/libensemble/tests/unit_tests/test_jobcontroller.py +++ b/libensemble/tests/unit_tests/test_jobcontroller.py @@ -6,10 +6,7 @@ import time import pytest import socket -from libensemble.register import Register, BalsamRegister from libensemble.controller import JobController -from libensemble.mpi_controller import MPIJobController -from libensemble.balsam_controller import BalsamJobController USE_BALSAM = False @@ -23,10 +20,6 @@ def setup_module(module): ctrl = JobController.controller del ctrl JobController.controller = None - if Register.default_registry: - defreg = Register.default_registry - del defreg - Register.default_registry = None def setup_function(function): print ("setup_function function:%s" % function.__name__) @@ -34,10 +27,6 @@ def setup_function(function): ctrl = JobController.controller del ctrl JobController.controller = None - if Register.default_registry: - defreg = Register.default_registry - del defreg - Register.default_registry = None def teardown_module(module): print ("teardown_module module:%s" % module.__name__) @@ -45,10 +34,6 @@ def teardown_module(module): ctrl = JobController.controller del ctrl JobController.controller = None - if Register.default_registry: - defreg = Register.default_registry - del defreg - Register.default_registry = None #def setup_module(module): @@ -73,13 +58,13 @@ def setup_job_controller(): build_simfunc() if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry, auto_resources = False) + from libensemble.balsam_controller import BalsamJobController + jobctrl = BalsamJobController(auto_resources = False) else: - registry = Register() - jobctrl = MPIJobController(registry = registry, auto_resources = False) + from libensemble.mpi_controller import MPIJobController + jobctrl = MPIJobController(auto_resources = False) - registry.register_calc(full_path=sim_app, calc_type='sim') + jobctrl.register_calc(full_path=sim_app, calc_type='sim') def setup_job_controller_noreg(): #sim_app = './my_simjob.x' @@ -87,13 +72,13 @@ def setup_job_controller_noreg(): build_simfunc() if USE_BALSAM: - registry = BalsamRegister() + from libensemble.balsam_controller import BalsamJobController jobctrl = BalsamJobController(auto_resources = False) else: - registry = Register() + from libensemble.mpi_controller import MPIJobController jobctrl = MPIJobController(auto_resources = False) - registry.register_calc(full_path=sim_app, calc_type='sim') + jobctrl.register_calc(full_path=sim_app, calc_type='sim') def setup_job_controller_noapp(): #sim_app = './my_simjob.x' @@ -101,11 +86,11 @@ def setup_job_controller_noapp(): build_simfunc() if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry, auto_resources = False) + from libensemble.balsam_controller import BalsamJobController + jobctrl = 
BalsamJobController(auto_resources = False) else: - registry = Register() - jobctrl = MPIJobController(registry = registry, auto_resources = False) + from libensemble.mpi_controller import MPIJobController + jobctrl = MPIJobController(auto_resources = False) # ----------------------------------------------------------------------------- # The following would typically be in the user sim_func @@ -403,8 +388,7 @@ def test_launch_as_gen(): else: assert 0 - registry = Register.default_registry - registry.register_calc(full_path=sim_app, calc_type='gen') + jobctl.register_calc(full_path=sim_app, calc_type='gen') job = jobctl.launch(calc_type='gen', num_procs=cores, app_args=args_for_sim) job = polling_loop(jobctl, job) assert job.finished, "job.finished should be True. Returned " + str(job.finished) @@ -431,19 +415,6 @@ def test_launch_default_reg(): assert job.state == 'FINISHED', "job.state should be FINISHED. Returned " + str(job.state) -def test_create_jobcontroller_no_registry(): - print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) - cores = NCORES - args_for_sim = 'sleep 0.1' - #import pdb;pdb.set_trace() - try: - jobctrl = MPIJobController(auto_resources = False) - except: - assert 1 - else: - assert 0 - - def test_launch_no_app(): print("\nTest: {}\n".format(sys._getframe().f_code.co_name)) setup_job_controller_noapp() @@ -474,8 +445,7 @@ def test_kill_job_with_no_launch(): assert 0 # Create a job directly with no launch (Not supported for users) - registry = Register.default_registry - myapp = registry.sim_default_app + myapp = jobctl.sim_default_app job1 = Job(app = myapp, stdout = 'stdout.txt') try: jobctl.kill(job1) @@ -493,8 +463,7 @@ def test_poll_job_with_no_launch(): cores = NCORES # Create a job directly with no launch (Not supported for users) - registry = Register.default_registry - myapp = registry.sim_default_app + myapp = jobctl.sim_default_app job1 = Job(app = myapp, stdout = 'stdout.txt') try: job1.poll() @@ -529,8 +498,6 @@ def test_job_failure(): test_launch_and_kill() test_launch_as_gen() test_launch_default_reg() - setup_function(test_create_jobcontroller_no_registry) - test_create_jobcontroller_no_registry() test_launch_no_app() test_kill_job_with_no_launch() test_poll_job_with_no_launch() From bfa6250403b4305cd0fedd9bbbbf78840def9225 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 21:48:56 -0500 Subject: [PATCH 056/101] Minor cleanup in controller_tests. 
--- .../controller_tests/create_balsam_job.py | 4 -- .../test_jobcontroller.manager_poll.py | 11 ++-- .../controller_tests/test_jobcontroller.py | 14 ++--- .../test_jobcontroller_multi.py | 52 +++++++++++-------- 4 files changed, 43 insertions(+), 38 deletions(-) diff --git a/libensemble/tests/controller_tests/create_balsam_job.py b/libensemble/tests/controller_tests/create_balsam_job.py index 18ca7a14f..228f5e8ec 100644 --- a/libensemble/tests/controller_tests/create_balsam_job.py +++ b/libensemble/tests/controller_tests/create_balsam_job.py @@ -26,16 +26,12 @@ def del_jobs(): """ Deletes all jobs """ - import balsam.launcher.dag as dag - from balsam.service import models Job = models.BalsamJob deletion_objs = Job.objects.all() deletion_objs.delete() def add_app(name,exepath,desc): """ Add application to database """ - #import balsam.launcher.dag as dag - #from balsam.service import models AppDef = models.ApplicationDefinition app = AppDef() app.name = name diff --git a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py index f1213ef83..e3497c13d 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.manager_poll.py @@ -17,7 +17,7 @@ def build_simfunc(): #Build simfunc #buildstring='mpif90 -o my_simjob.x my_simjob.f90' # On cray need to use ftn - buildstring='mpicc -o my_simjob.x simdir/my_simjob.c' + buildstring = 'mpicc -o my_simjob.x simdir/my_simjob.c' #subprocess.run(buildstring.split(),check=True) #Python3.5+ subprocess.check_call(buildstring.split()) @@ -45,7 +45,7 @@ def build_simfunc(): from libensemble.mpi_controller import * jobctrl = MPIJobController() -jobctl.register_calc(full_path=sim_app, calc_type='sim') +jobctrl.register_calc(full_path=sim_app, calc_type='sim') #Alternative to IF could be using eg. fstring to specify: e.g: #JOB_CONTROLLER = 'Balsam' @@ -82,11 +82,11 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): if job.finished: if job.state == 'FINISHED': - print('Job finished succesfully. Status:',job.state) + print('Job finished succesfully. Status:', job.state) elif job.state == 'FAILED': - print('Job failed. Status:',job.state) + print('Job failed. Status:', job.state) elif job.state == 'USER_KILLED': - print('Job has been killed. Status:',job.state) + print('Job has been killed. 
Status:', job.state) else: print('Job status:', job.state) else: @@ -118,4 +118,3 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) polling_loop(jobctl, job) - diff --git a/libensemble/tests/controller_tests/test_jobcontroller.py b/libensemble/tests/controller_tests/test_jobcontroller.py index 605e6da7e..401c29bdd 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller.py +++ b/libensemble/tests/controller_tests/test_jobcontroller.py @@ -8,7 +8,7 @@ def build_simfunc(): #Build simfunc #buildstring='mpif90 -o my_simjob.x my_simjob.f90' # On cray need to use ftn - buildstring='mpicc -o my_simjob.x simdir/my_simjob.c' + buildstring = 'mpicc -o my_simjob.x simdir/my_simjob.c' #subprocess.run(buildstring.split(),check=True) #Python3.5+ subprocess.check_call(buildstring.split()) @@ -36,7 +36,7 @@ def build_simfunc(): from libensemble.mpi_controller import * jobctrl = MPIJobController() -jobctl.register_calc(full_path=sim_app, calc_type='sim') +jobctrl.register_calc(full_path=sim_app, calc_type='sim') #Alternative to IF could be using eg. fstring to specify: e.g: #JOB_CONTROLLER = 'Balsam' @@ -68,11 +68,11 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): if job.finished: if job.state == 'FINISHED': - print('Job finished succesfully. Status:',job.state) + print('Job finished succesfully. Status:', job.state) elif job.state == 'FAILED': - print('Job failed. Status:',job.state) + print('Job failed. Status:', job.state) elif job.state == 'USER_KILLED': - print('Job has been killed. Status:',job.state) + print('Job has been killed. Status:', job.state) else: print('Job status:', job.state) else: @@ -87,7 +87,8 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): # Tests -#From worker call JobController by different name to ensure getting registered app from JobController +#From worker call JobController by different name to ensure +#getting registered app from JobController jobctl = JobController.controller print('\nTest 1 - should complete succesfully with status FINISHED :\n') @@ -103,4 +104,3 @@ def polling_loop(jobctl, job, timeout_sec=20.0, delay=2.0): job = jobctl.launch(calc_type='sim', num_procs=cores, app_args=args_for_sim) polling_loop(jobctl, job) - diff --git a/libensemble/tests/controller_tests/test_jobcontroller_multi.py b/libensemble/tests/controller_tests/test_jobcontroller_multi.py index bec0d4f10..1309f7053 100644 --- a/libensemble/tests/controller_tests/test_jobcontroller_multi.py +++ b/libensemble/tests/controller_tests/test_jobcontroller_multi.py @@ -1,7 +1,8 @@ -#Test of job controller running multiple jobs for libensemble -#Could support hybrid mode - including, eg. running multi jobs per node (launched locally), or -#simply sharing burden on central system/consecutive pipes to balsam database - could enable -#use of threads if supply run-directories rather than assuming in-place runs etc.... +#Test of job controller running multiple jobs for libensemble. Could support +#hybrid mode - including, eg. running multi jobs per node (launched locally), +#or simply sharing burden on central system/consecutive pipes to balsam +#database - could enable use of threads if supply run-directories rather than +#assuming in-place runs etc.... 
#Test does not require running full libensemble import os @@ -11,7 +12,7 @@ def build_simfunc(): #Build simfunc #buildstring='mpif90 -o my_simjob.x my_simjob.f90' # On cray need to use ftn - buildstring='mpicc -o my_simjob.x simdir/my_simjob.c' + buildstring = 'mpicc -o my_simjob.x simdir/my_simjob.c' #subprocess.run(buildstring.split(),check=True) #Python3.5+ subprocess.check_call(buildstring.split()) @@ -64,21 +65,26 @@ def polling_loop(jobctl, job_list, timeout_sec=40.0, delay=1.0): for job in job_list: if not job.finished: time.sleep(delay) - print('Polling job %d at time %f' % (job.id, time.time() - start)) + print('Polling job {0} at time {1}'. + format(job.id, time.time() - start)) job.poll() if job.finished: continue - elif job.state == 'WAITING': print('Job %d waiting to launch' % (job.id)) - elif job.state == 'RUNNING': print('Job %d still running ....' % (job.id)) + elif job.state == 'WAITING': + print('Job {0} waiting to launch'.format(job.id)) + elif job.state == 'RUNNING': + print('Job {0} still running ....'.format(job.id)) #Check output file for error if job.stdout_exists(): if 'Error' in job.read_stdout(): - print("Found (deliberate) Error in ouput file - cancelling job %d" % (job.id)) + print("Found (deliberate) Error in ouput file - " + "cancelling job {}".format(job.id)) jobctl.kill(job) time.sleep(delay) #Give time for kill continue - #But if I want to do something different - I want to make a file - no function for THAT! + #But if I want to do something different - + # I want to make a file - no function for THAT! #But you can get all the job attributes! #Uncomment to test #path = os.path.join(job.workdir,'newfile'+str(time.time())) @@ -89,26 +95,31 @@ def polling_loop(jobctl, job_list, timeout_sec=40.0, delay=1.0): for job in job_list: if job.finished: if job.state == 'FINISHED': - print('Job %d finished succesfully. Status: %s' % (job.id, job.state)) + print('Job {0} finished succesfully. Status: {1}'. + format(job.id, job.state)) elif job.state == 'FAILED': - print('Job %d failed. Status: %s' % (job.id, job.state)) + print('Job {0} failed. Status: {1}'. + format(job.id, job.state)) elif job.state == 'USER_KILLED': - print('Job %d has been killed. Status: %s' % (job.id, job.state)) + print('Job {0} has been killed. Status: {1}'. + format(job.id, job.state)) else: - print('Job %d status: %s' % (job.id, job.state)) + print('Job {0} status: {1}'.format(job.id, job.state)) else: - print('Job %d timed out. Status: %s' % (job.id, job.state)) + print('Job {0} timed out. Status: {1}'.format(job.id, job.state)) jobctl.kill(job) if job.finished: - print('Job %d Now killed. Status: %s' % (job.id, job.state)) + print('Job {0} Now killed. Status: {1}'. 
+ format(job.id, job.state)) #double check job.poll() - print('Job %d state is %s' % (job.id, job.state)) + print('Job {0} state is {1}'.format(job.id, job.state)) # Tests -#From worker call JobController by different name to ensure getting registered app from JobController +#From worker call JobController by different name to ensure getting registered +#app from JobController jobctl = JobController.controller @@ -118,7 +129,8 @@ def polling_loop(jobctl, job_list, timeout_sec=40.0, delay=1.0): cores = 4 for j in range(3): - #outfilename = 'out_' + str(j) + '.txt' #Could allow launch to generate outfile names based on job.id + ##Could allow launch to generate outfile names based on job.id + #outfilename = 'out_' + str(j) + '.txt' sleeptime = 6 + j*3 #Change args args_for_sim = 'sleep' + ' ' + str(sleeptime) rundir = 'run_' + str(sleeptime) @@ -127,5 +139,3 @@ def polling_loop(jobctl, job_list, timeout_sec=40.0, delay=1.0): polling_loop(jobctl, job_list) - - From ec55613bed451ef3ebde3b80bd282f6b432a5c10 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 22:29:02 -0500 Subject: [PATCH 057/101] Add job stderr testing. --- libensemble/tests/unit_tests/test_job_funcs.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/unit_tests/test_job_funcs.py b/libensemble/tests/unit_tests/test_job_funcs.py index e24adae2a..8ac2dbffc 100644 --- a/libensemble/tests/unit_tests/test_job_funcs.py +++ b/libensemble/tests/unit_tests/test_job_funcs.py @@ -41,14 +41,14 @@ def test_job_funcs(): #First try no app - check exception raised? jc_triggered = False try: - job = Job(workdir = myworkdir, stdout = 'stdout.txt') + job = Job(workdir=myworkdir, stdout='stdout.txt', stderr='stderr.txt') except JobControllerException: jc_triggered = True assert jc_triggered, "Failed to raise exception if create job with no app" #Now with no workdir specified dummyapp = jobctrl.gen_default_app - job1 = Job(app = dummyapp, stdout = 'stdout.txt') + job1 = Job(app=dummyapp, stdout='stdout.txt', stderr='stderr.txt') wd_exist = job1.workdir_exists() assert not wd_exist #, "No workdir specified, yet workdir_exists does not return False" stdout_exist = job1.stdout_exists() @@ -57,7 +57,7 @@ def test_job_funcs(): assert not f_exist #Create job properly specified - job2 = Job(app = dummyapp, workdir = myworkdir ,stdout = 'stdout.txt') + job2 = Job(app=dummyapp, workdir=myworkdir, stdout='stdout.txt', stderr='stderr.txt') #Workdir does exist wd_exist = job2.workdir_exists() @@ -66,6 +66,8 @@ def test_job_funcs(): #Files do not exist stdout_exist = job2.stdout_exists() assert not stdout_exist + stderr_exist = job2.stderr_exists() + assert not stderr_exist f_exist = job2.file_exists_in_workdir('running_output.txt') assert not f_exist @@ -86,6 +88,8 @@ def test_job_funcs(): #Now create files and check positive results with open("stdout.txt","w") as f: f.write('This is stdout') + with open("stderr.txt","w") as f: + f.write('This is stderr') with open("running_output.txt","w") as f: f.write('This is running output') @@ -94,9 +98,12 @@ def test_job_funcs(): #assert wd_exist stdout_exist = job2.stdout_exists() assert stdout_exist + stderr_exist = job2.stderr_exists() + assert stderr_exist f_exist = job2.file_exists_in_workdir('running_output.txt') assert f_exist assert 'This is stdout' in job2.read_stdout() + assert 'This is stderr' in job2.read_stderr() assert 'This is running output' in job2.read_file_in_workdir('running_output.txt') #Check if workdir does not exist From 
ba5cadfbc6f4eb9cd262c6198459008d0e570bc4 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 22:37:32 -0500 Subject: [PATCH 058/101] Added basic unit tests for calc_job_timing. --- libensemble/tests/unit_tests/test_job_funcs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libensemble/tests/unit_tests/test_job_funcs.py b/libensemble/tests/unit_tests/test_job_funcs.py index 8ac2dbffc..eb942b825 100644 --- a/libensemble/tests/unit_tests/test_job_funcs.py +++ b/libensemble/tests/unit_tests/test_job_funcs.py @@ -1,5 +1,6 @@ import os import shutil +import time from libensemble.controller import Job, JobController, JobControllerException from libensemble.mpi_controller import MPIJobController @@ -111,6 +112,15 @@ def test_job_funcs(): wd_exist = job2.workdir_exists() assert not wd_exist + # Check timing + assert not job2.launch_time and not job2.runtime and not job2.total_time + job2.calc_job_timing() + assert not job2.launch_time and not job2.runtime and not job2.total_time + job2.launch_time = time.time() + job2.calc_job_timing() + assert job2.runtime is not None and job2.runtime == job2.total_time + + # Clean up os.chdir('../') shutil.rmtree(dirname) From 1650c08f7619c6ecce794e70627519d4a06fc844 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Mon, 10 Sep 2018 22:42:34 -0500 Subject: [PATCH 059/101] Added double calc_job_timing call test. --- libensemble/tests/unit_tests/test_job_funcs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libensemble/tests/unit_tests/test_job_funcs.py b/libensemble/tests/unit_tests/test_job_funcs.py index eb942b825..f74c59590 100644 --- a/libensemble/tests/unit_tests/test_job_funcs.py +++ b/libensemble/tests/unit_tests/test_job_funcs.py @@ -119,6 +119,10 @@ def test_job_funcs(): job2.launch_time = time.time() job2.calc_job_timing() assert job2.runtime is not None and job2.runtime == job2.total_time + save_runtime, save_total_time = job2.runtime, job2.total_time + job2.calc_job_timing() + assert save_runtime == job2.runtime + assert save_total_time == job2.total_time # Clean up os.chdir('../') From c7815022754b5425d4dc90e7a0eb223be71ab787 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Fri, 21 Sep 2018 09:23:05 -0500 Subject: [PATCH 060/101] Adding one more NLopt method to aposmm --- libensemble/gen_funcs/aposmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 532ac2723..9a37ea253 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -391,7 +391,7 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): x_new = np.ones((1,len(gen_specs['ub'])))*np.inf; pt_in_run = 0; total_pts_in_run = len(sorted_run_inds) - if gen_specs['localopt_method'] in ['LN_SBPLX', 'LN_BOBYQA', 'LN_NELDERMEAD', 'LD_MMA']: + if gen_specs['localopt_method'] in ['LN_SBPLX', 'LN_BOBYQA', 'LN_COBYLA', 'LN_NELDERMEAD', 'LD_MMA']: if gen_specs['localopt_method'] in ['LD_MMA']: fields_to_pass = ['x_on_cube','f','grad'] From 325c7eee83939bb0cc94420a5e1670ddbf08732a Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 3 Oct 2018 06:08:45 -0500 Subject: [PATCH 061/101] Starting an improved aposmm branch From 0a4e41846fe64c59c7f33b1b4ba8914aa29657f3 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 3 Oct 2018 07:27:23 -0500 Subject: [PATCH 062/101] Just returning r_k instead of rk_const from initialize aposmm --- .../start_persistent_local_opt_gens.py | 4 ++-- libensemble/gen_funcs/aposmm.py | 18 
+++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py index 613ba17e8..5fa60432f 100644 --- a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py +++ b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py @@ -58,9 +58,9 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, persis_info): for i in avail_worker_ids(W, persistent=False): # Find candidates to start local opt runs if a sample has been evaluated if np.any(np.logical_and(~H['local_pt'], H['returned'])): - _, n_s, _, _, rk_const, lhs_divisions, mu, nu = initialize_APOSMM(H, gen_specs) + _, n_s, _, _, r_k, mu, nu = initialize_APOSMM(H, gen_specs) update_history_dist(H, gen_specs, c_flag=False) - starting_inds = decide_where_to_start_localopt(H, n_s, rk_const, lhs_divisions, mu, nu) + starting_inds = decide_where_to_start_localopt(H, n_s, r_k, mu, nu) else: starting_inds = [] diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 532ac2723..33c18705f 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -132,7 +132,7 @@ def aposmm_logic(H,persis_info,gen_specs,_): """ - n, n_s, c_flag, O, rk_const, lhs_divisions, mu, nu = initialize_APOSMM(H, gen_specs) + n, n_s, c_flag, O, r_k, mu, nu = initialize_APOSMM(H, gen_specs) # np.savez('H'+str(len(H)),H=H,gen_specs=gen_specs,persis_info=persis_info) if n_s < gen_specs['initial_sample_size']: @@ -143,7 +143,7 @@ def aposmm_logic(H,persis_info,gen_specs,_): updated_inds = update_history_dist(H, gen_specs, c_flag) - starting_inds = decide_where_to_start_localopt(H, n_s, rk_const, lhs_divisions, mu, nu) + starting_inds = decide_where_to_start_localopt(H, n_s, r_k, mu, nu) updated_inds.update(starting_inds) for ind in starting_inds: @@ -586,7 +586,7 @@ def pounders_obj_func(tao, X, F, Run_H): -def decide_where_to_start_localopt(H, n_s, rk_const, lhs_divisions=0, mu=0, nu=0, gamma_quantile=1): +def decide_where_to_start_localopt(H, n_s, r_k, mu=0, nu=0, gamma_quantile=1): """ Finds points in the history that satisfy the conditions (S1-S5 and L1-L8) in Table 1 of the `APOSMM paper `_ @@ -607,8 +607,8 @@ def decide_where_to_start_localopt(H, n_s, rk_const, lhs_divisions=0, mu=0, nu=0 History array storing rows for each point. 
n_s: integer Number of sample points - rk_const: float - Constant in front of r_k evaluation + r_k_const: float + Radius for deciding when to start runs lhs_divisions: integer Number of Latin hypercube sampling divisions (0 or 1 means uniform random sampling over the domain) @@ -631,7 +631,6 @@ def decide_where_to_start_localopt(H, n_s, rk_const, lhs_divisions=0, mu=0, nu=0 """ n = len(H['x_on_cube'][0]) - r_k = calc_rk(n, n_s, rk_const, lhs_divisions) if nu > 0: test_2_through_5 = np.logical_and.reduce(( @@ -861,7 +860,12 @@ def initialize_APOSMM(H, gen_specs): else: nu = 0 - return n, n_s, c_flag, O, rk_c, ld, mu, nu + if n_s > 0: + r_k = calc_rk(n, n_s, rk_c, ld) + else: + r_k = np.inf + + return n, n_s, c_flag, O, r_k, mu, nu def queue_update_function(H, gen_specs, persis_info): From 2aaafa2cbe18fb67b706d20f2ca51b5b85f0178e Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 3 Oct 2018 07:31:02 -0500 Subject: [PATCH 063/101] Removing old L8 test code from aposmm --- libensemble/gen_funcs/aposmm.py | 75 +-------------------------------- 1 file changed, 1 insertion(+), 74 deletions(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 33c18705f..2a7e4d3d6 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -683,80 +683,7 @@ def decide_where_to_start_localopt(H, n_s, r_k, mu=0, nu=0, gamma_quantile=1): local_start_inds2 = list(np.ix_(local_seeds)[0]) - # if ignore_L8: - # if True: - # local_start_inds2 = list(np.ix_(local_seeds)[0]) - # else: - # # ### For L8, search for an rk-ascent path for a sample point - # # lb = np.zeros(n) - # # ub = np.ones(n) - # # local_start_inds = [] - # # for i in np.ix_(local_seeds)[0]: - # # old_local_on_rk_ascent = np.array(np.zeros(len(H)), dtype=bool) - # # local_on_rk_ascent = np.array(np.eye(len(H))[i,:], dtype=bool) - - # # done_with_i = False - # # while not done_with_i and not np.array_equiv(old_local_on_rk_ascent, local_on_rk_ascent): - # # old_local_on_rk_ascent = local_on_rk_ascent.copy() - # # to_add = np.array(np.zeros(len(H)),dtype=bool) - # # for j in np.ix_(local_on_rk_ascent)[0]: - # # if keep_pdist: - # # samples_on_rk_ascent_from_j = np.logical_and.reduce((H['f'][j] <= H['f'], ~H['local_pt'], H['dist_to_all'][:,j] <= r_k)) - # # else: - # # ind_of_last = np.max(np.ix_(H['returned'])) - # # pdist_vec = cdist([H['x_on_cube'][j]], H['x_on_cube'][:ind_of_last+1], 'euclidean').flatten() - # # pdist_vec = np.append(pdist_vec, np.zeros(len(H)-ind_of_last-1)) - # # samples_on_rk_ascent_from_j = np.logical_and.reduce((H['f'][j] <= H['f'], ~H['local_pt'], pdist_vec <= r_k)) - - # # if np.any(np.logical_and(samples_on_rk_ascent_from_j, sample_seeds)): - # # done_with_i = True - # # local_start_inds.append(i) - # # break - - # # if keep_pdist: - # # feasible_locals_on_rk_ascent_from_j = np.logical_and.reduce((H['f'][j] <= H['f'], - # # np.all(ub - H['x_on_cube'] >= 0, axis=1), - # # np.all(H['x_on_cube'] - lb >= 0, axis=1), - # # H['local_pt'], - # # H['dist_to_all'][:,j] <= r_k - # # )) - # # else: - # # feasible_locals_on_rk_ascent_from_j = np.logical_and.reduce((H['f'][j] <= H['f'], - # # np.all(ub - H['x_on_cube'] >= 0, axis=1), - # # np.all(H['x_on_cube'] - lb >= 0, axis=1), - # # H['local_pt'], - # # pdist_vec <= r_k - # # )) - - # # to_add = np.logical_or(to_add, feasible_locals_on_rk_ascent_from_j) - # # local_on_rk_ascent = to_add.copy() - - # # if not done_with_i: - # # # sys.exit("We have an i satisfying (L1-L7) but failing L8") - # # print("\n\n We have ind %d 
satisfying (L1-L7) but failing L8 \n\n" % i) - - # # ### Faster L8 test - # local_start_inds2 = [] - # for i in np.ix_(local_seeds)[0]: - # old_pts_on_rk_ascent = np.array(np.zeros(len(H)), dtype=bool) - # pts_on_rk_ascent = H['worse_within_rk'][i] - - # done_with_i = False - # while not done_with_i and not np.array_equiv(old_pts_on_rk_ascent, pts_on_rk_ascent): - # old_pts_on_rk_ascent = pts_on_rk_ascent.copy() - # to_add = np.array(np.zeros(len(H)),dtype=bool) - # for j in np.ix_(pts_on_rk_ascent)[0]: - # to_add = np.logical_or(to_add, H['worse_within_rk'][i]) - # pts_on_rk_ascent = to_add - # if np.any(np.logical_and(to_add, sample_seeds)): - # done_with_i = True - # local_start_inds2.append(i) - # break - # if not done_with_i: - # print("Again, we have ind %d satisfying (L1-L7) but failing L8\n" % i) - - # # assert local_start_inds.sort() == local_start_inds2.sort(), "Something didn't match up" - # # start_inds = list(sample_start_inds) + local_start_inds + start_inds = list(sample_start_inds) + local_start_inds2 return start_inds From b29c9a0202dfa25435a3ea433cd8693757df43ad Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 3 Oct 2018 11:32:11 -0500 Subject: [PATCH 064/101] A version of APOSMM that limits the number of active runs --- libensemble/gen_funcs/aposmm.py | 13 +++++++++++++ .../tests/regression_tests/test_branin_aposmm.py | 1 + 2 files changed, 14 insertions(+) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 2a7e4d3d6..13d5e81d3 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -162,6 +162,19 @@ def aposmm_logic(H,persis_info,gen_specs,_): persis_info['active_runs'].update([new_run_num]) persis_info['total_runs'] +=1 + if 'max_active_runs' in gen_specs: + num_runs = len(persis_info['run_order']) + run_vals = np.zeros((num_runs,2)) + for i,run in enumerate(persis_info['run_order'].keys()): + run_vals[i,0] = run + run_vals[i,1] = np.min(H['f'][persis_info['run_order'][run]]) + + + num_active_runs = min(gen_specs['max_active_runs'],num_runs) + k_sorted = np.argpartition(run_vals[:,1],kth=num_active_runs-1) + + persis_info['active_runs'] = set(run_vals[k_sorted[:num_active_runs],0].astype(int)) + inactive_runs = set() # Find next point in any uncompleted runs using information stored in persis_info diff --git a/libensemble/tests/regression_tests/test_branin_aposmm.py b/libensemble/tests/regression_tests/test_branin_aposmm.py index 740d2d9f8..cdb7a9fb4 100644 --- a/libensemble/tests/regression_tests/test_branin_aposmm.py +++ b/libensemble/tests/regression_tests/test_branin_aposmm.py @@ -75,6 +75,7 @@ 'num_active_gens': 1, 'batch_mode': True, 'high_priority_to_best_localopt_runs': True, + 'max_active_runs': 3, } # Tell libEnsemble when to stop From 222cd173b87b0623608b5f39d370e7af27b2759d Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 3 Oct 2018 14:33:03 -0500 Subject: [PATCH 065/101] Moving some aposmm exception code to one function --- libensemble/gen_funcs/aposmm.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 13d5e81d3..9ed9cc00d 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -415,15 +415,7 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): x_opt, exit_code = set_up_and_run_nlopt(H[fields_to_pass][sorted_run_inds], gen_specs) except Exception as e: exit_code = 0 - print(e.__doc__) - print(e.args) - 
print("These are the points in the run that has failed:", H['x_on_cube'][sorted_run_inds]) - _, _, tb = sys.exc_info() - traceback.print_tb(tb) # Fixed format - tb_info = traceback.extract_tb(tb) - filename, line, func, text = tb_info[-1] - print('An error occurred on line {} in statement {}'.format(line, text)) - + display_exception(e) elif gen_specs['localopt_method'] in ['pounders']: @@ -440,14 +432,8 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): x_opt, exit_code = set_up_and_run_tao(Run_H, gen_specs) except Exception as e: exit_code = 0 - print(e.__doc__) - print(e.args) - print("These are the points in the run that has failed:", Run_H['x_on_cube']) - _, _, tb = sys.exc_info() - traceback.print_tb(tb) # Fixed format - tb_info = traceback.extract_tb(tb) - filename, line, func, text = tb_info[-1] - print('An error occurred on line {} in statement {}'.format(line, text)) + display_exception(e) + else: sys.exit("Unknown localopt method. Exiting") @@ -873,6 +859,14 @@ def queue_update_function(H, gen_specs, persis_info): return persis_info +def display_exception(e): + print(e.__doc__) + print(e.args) + _, _, tb = sys.exc_info() + traceback.print_tb(tb) # Fixed format + tb_info = traceback.extract_tb(tb) + filename, line, func, text = tb_info[-1] + print('An error occurred on line {} in statement {}'.format(line, text)) # if __name__ == "__main__": # [H,gen_specs,persis_info] = [np.load('H20.npz')[i] for i in ['H','gen_specs','persis_info']] From 169af0d59204c81ccf766ad93213cb84246418c6 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 3 Oct 2018 14:39:37 -0500 Subject: [PATCH 066/101] A commented out scipy.optimize call --- libensemble/gen_funcs/aposmm.py | 48 +++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 9ed9cc00d..56bc71205 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -13,6 +13,8 @@ import numpy as np # import scipy as sp from scipy.spatial.distance import cdist +from scipy import optimize as scipy_optimize + from mpi4py import MPI from numpy.lib.recfunctions import merge_arrays @@ -434,6 +436,17 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): exit_code = 0 display_exception(e) + # elif gen_specs['localopt_method'] in ['COBYLA']: + + # fields_to_pass = ['x_on_cube','f'] + + # try: + # x_opt, exit_code = set_up_and_run_scipy_minimize(H[fields_to_pass][sorted_run_inds], gen_specs) + # except Exception as e: + # exit_code = 0 + # display_exception(e) + + else: sys.exit("Unknown localopt method. Exiting") @@ -449,6 +462,41 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): +# def set_up_and_run_scipy_minimize(Run_H, gen_specs): +# """ Set up objective and runs scipy + +# Declares the appropriate syntax for our special objective function to read +# through Run_H, sets the parameters and starting points for the run. 
+# """ + +# def scipy_obj_fun(x, Run_H): +# out = look_in_history(x, Run_H) + +# return out + +# obj = lambda x: scipy_obj_fun(x, Run_H) +# x0 = Run_H['x_on_cube'][0] + + +# import ipdb; ipdb.set_trace() +# #construct the bounds in the form of constraints +# cons = [] +# for factor in range(len(x0)): +# l = {'type': 'ineq', +# 'fun': lambda x, lb=gen_specs['lb'][factor], i=factor: x[i] - lb} +# u = {'type': 'ineq', +# 'fun': lambda x, ub=gen_specs['ub'][factor], i=factor: ub - x[i]} +# cons.append(l) +# cons.append(u) + +# res = scipy_optimize.minimize(obj,x0,method=gen_specs['localopt_method'],options={'maxiter':len(Run_H['x_on_cube'])+1}) + +# if res['status'] == 2: # SciPy code for exhausting budget of evaluations, so not at a minimum +# exit_code = 0 + +# x_opt = res['x'] +# return x_opt, exit_code + def set_up_and_run_nlopt(Run_H, gen_specs): """ Set up objective and runs nlopt From 645cae16b7f0f9a203b538535aa9c909cb183122 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Thu, 4 Oct 2018 11:18:04 -0500 Subject: [PATCH 067/101] Fixing the logic of APOSMM in deciding where to give evaluations --- .../start_persistent_local_opt_gens.py | 4 +-- libensemble/gen_funcs/aposmm.py | 32 +++++++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py index 5fa60432f..f46dc8328 100644 --- a/libensemble/alloc_funcs/start_persistent_local_opt_gens.py +++ b/libensemble/alloc_funcs/start_persistent_local_opt_gens.py @@ -58,9 +58,9 @@ def start_persistent_local_opt_gens(W, H, sim_specs, gen_specs, persis_info): for i in avail_worker_ids(W, persistent=False): # Find candidates to start local opt runs if a sample has been evaluated if np.any(np.logical_and(~H['local_pt'], H['returned'])): - _, n_s, _, _, r_k, mu, nu = initialize_APOSMM(H, gen_specs) + _, _, _, _, r_k, mu, nu = initialize_APOSMM(H, gen_specs) update_history_dist(H, gen_specs, c_flag=False) - starting_inds = decide_where_to_start_localopt(H, n_s, r_k, mu, nu) + starting_inds = decide_where_to_start_localopt(H, r_k, mu, nu) else: starting_inds = [] diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 56bc71205..249f1ea76 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -12,8 +12,8 @@ import sys, os, traceback import numpy as np # import scipy as sp -from scipy.spatial.distance import cdist -from scipy import optimize as scipy_optimize +from scipy.spatial.distance import cdist, pdist, squareform +# from scipy import optimize as scipy_optimize from mpi4py import MPI @@ -145,7 +145,7 @@ def aposmm_logic(H,persis_info,gen_specs,_): updated_inds = update_history_dist(H, gen_specs, c_flag) - starting_inds = decide_where_to_start_localopt(H, n_s, r_k, mu, nu) + starting_inds = decide_where_to_start_localopt(H, r_k, mu, nu) updated_inds.update(starting_inds) for ind in starting_inds: @@ -164,18 +164,24 @@ def aposmm_logic(H,persis_info,gen_specs,_): persis_info['active_runs'].update([new_run_num]) persis_info['total_runs'] +=1 - if 'max_active_runs' in gen_specs: - num_runs = len(persis_info['run_order']) - run_vals = np.zeros((num_runs,2)) + num_runs = len(persis_info['run_order']) + if 'max_active_runs' in gen_specs and gen_specs['max_active_runs'] < num_runs: + run_vals = np.zeros((num_runs,2),dtype=int) for i,run in enumerate(persis_info['run_order'].keys()): run_vals[i,0] = run - run_vals[i,1] = 
np.min(H['f'][persis_info['run_order'][run]]) + run_vals[i,1] = persis_info['run_order'][run][np.nanargmin(H['f'][persis_info['run_order'][run]])] + P = squareform(pdist(H['x_on_cube'][run_vals[:,1]], 'euclidean')) + dist_to_better = np.inf*np.ones(num_runs) + + for i in range(num_runs): + better = H['f'][run_vals[:,1]]`_ @@ -652,8 +658,6 @@ def decide_where_to_start_localopt(H, n_s, r_k, mu=0, nu=0, gamma_quantile=1): ---------- H: numpy structured array History array storing rows for each point. - n_s: integer - Number of sample points r_k_const: float Radius for deciding when to start runs lhs_divisions: integer From d7d3db16f829a0816886f4f251892482d32f4e3f Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Fri, 5 Oct 2018 15:18:39 -0500 Subject: [PATCH 068/101] Saving info from completed runs to persis_info --- libensemble/gen_funcs/aposmm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 249f1ea76..26aadd74f 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -153,6 +153,7 @@ def aposmm_logic(H,persis_info,gen_specs,_): if not np.any(H['started_run']): persis_info['active_runs'] = set() persis_info['run_order'] = {} + persis_info['old_runs'] = {} persis_info['total_runs'] = 0 new_run_num = persis_info['total_runs'] @@ -207,7 +208,8 @@ def aposmm_logic(H,persis_info,gen_specs,_): for i in inactive_runs: persis_info['active_runs'].remove(i) - persis_info['run_order'].pop(i) # Deletes any information about this run + old_run = persis_info['run_order'].pop(i) # Deletes any information about this run + persis_info['old_runs'][i] = old_run if len(H) == 0: samples_needed = gen_specs['initial_sample_size'] From 92c4415c6df7d0a6b07098a68ea8faf5987be9b5 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Fri, 5 Oct 2018 15:56:15 -0500 Subject: [PATCH 069/101] Adding back COBYLA --- libensemble/gen_funcs/aposmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 26aadd74f..fc34b4ba6 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -414,7 +414,7 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): x_new = np.ones((1,len(gen_specs['ub'])))*np.inf; pt_in_run = 0; total_pts_in_run = len(sorted_run_inds) - if gen_specs['localopt_method'] in ['LN_SBPLX', 'LN_BOBYQA', 'LN_NELDERMEAD', 'LD_MMA']: + if gen_specs['localopt_method'] in ['LN_SBPLX', 'LN_BOBYQA', 'LN_COBYLA', 'LN_NELDERMEAD', 'LD_MMA']: if gen_specs['localopt_method'] in ['LD_MMA']: fields_to_pass = ['x_on_cube','f','grad'] From 86775406f31b921981826244924d20b08385bdab Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 9 Oct 2018 09:31:40 -0500 Subject: [PATCH 070/101] Adding 'mock' to list of required packages for tests. 
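
The unit tests use mock to stand in for objects that are awkward to construct for real (controllers, launched jobs, MPI state). A hypothetical sketch of the pattern only, not code from this patch; the test name and faked attributes are invented for illustration:

    import mock  # the standalone package added here; unittest.mock also works on Python 3

    def test_kill_issued_for_unfinished_job():
        # Stand-ins for a controller and a job; only the attributes the check uses are faked.
        jobctl = mock.MagicMock()
        job = mock.MagicMock(finished=False, state='RUNNING')
        if not job.finished:
            jobctl.kill(job)
        jobctl.kill.assert_called_once_with(job)
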
--- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 7fdf55359..ea48670a9 100644 --- a/README.rst +++ b/README.rst @@ -79,7 +79,9 @@ regularly on: * `Travis CI `_ -The test suite requires the pytest, pytest-cov and pytest-timeout packages to be installed and can be run from the libensemble/tests directory of the source distribution by running:: +The test suite requires the mock, pytest, pytest-cov and pytest-timeout +packages to be installed and can be run from the libensemble/tests directory of +the source distribution by running:: ./run-tests.sh From af7e98cf208da8149807ad5dcb9f9fefe937d748 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 9 Oct 2018 10:50:05 -0500 Subject: [PATCH 071/101] Ensuring persis_info fields are declared, even if there are no runs yet started. --- libensemble/gen_funcs/aposmm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index fc34b4ba6..b9d3a51f9 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -148,14 +148,14 @@ def aposmm_logic(H,persis_info,gen_specs,_): starting_inds = decide_where_to_start_localopt(H, r_k, mu, nu) updated_inds.update(starting_inds) + if not np.any(H['started_run']): + persis_info['active_runs'] = set() + persis_info['run_order'] = {} + persis_info['old_runs'] = {} + persis_info['total_runs'] = 0 + for ind in starting_inds: # Find the run number - if not np.any(H['started_run']): - persis_info['active_runs'] = set() - persis_info['run_order'] = {} - persis_info['old_runs'] = {} - persis_info['total_runs'] = 0 - new_run_num = persis_info['total_runs'] H['started_run'][ind] = 1 From 782248eef77dd29244943b2d73c2479700387615 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 9 Oct 2018 14:54:41 -0500 Subject: [PATCH 072/101] Allowing APOSMM to be called when not all runs are completed --- libensemble/gen_funcs/aposmm.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index b9d3a51f9..f05e5aace 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -148,12 +148,6 @@ def aposmm_logic(H,persis_info,gen_specs,_): starting_inds = decide_where_to_start_localopt(H, r_k, mu, nu) updated_inds.update(starting_inds) - if not np.any(H['started_run']): - persis_info['active_runs'] = set() - persis_info['run_order'] = {} - persis_info['old_runs'] = {} - persis_info['total_runs'] = 0 - for ind in starting_inds: # Find the run number new_run_num = persis_info['total_runs'] @@ -188,6 +182,8 @@ def aposmm_logic(H,persis_info,gen_specs,_): # Find next point in any uncompleted runs using information stored in persis_info for run in persis_info['active_runs']: + if not np.all(H['returned'][persis_info['run_order'][run]]): + continue # Can't advance this run since all of it's points haven't been returned. 
x_opt, exit_code, persis_info, sorted_run_inds = advance_localopt_method(H, gen_specs, c_flag, run, persis_info) @@ -410,8 +406,6 @@ def advance_localopt_method(H, gen_specs, c_flag, run, persis_info): while 1: sorted_run_inds = persis_info['run_order'][run] - assert all(H['returned'][sorted_run_inds]) - x_new = np.ones((1,len(gen_specs['ub'])))*np.inf; pt_in_run = 0; total_pts_in_run = len(sorted_run_inds) if gen_specs['localopt_method'] in ['LN_SBPLX', 'LN_BOBYQA', 'LN_COBYLA', 'LN_NELDERMEAD', 'LD_MMA']: From 567800d576d206e8885b7c65d3e65e61aaf45fa6 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 9 Oct 2018 16:20:06 -0500 Subject: [PATCH 073/101] Using new persis_info in all APOSMM calls --- .../alloc_funcs/fast_alloc_to_aposmm.py | 25 +++++++++++++------ .../test_6-hump_camel_aposmm_LD_MMA.py | 16 +++++++++--- .../regression_tests/test_branin_aposmm.py | 6 +++++ ...t_chwirut_aposmm_one_residual_at_a_time.py | 7 ++++++ .../regression_tests/test_chwirut_pounders.py | 7 ++++++ ...uniform_sampling_one_residual_at_a_time.py | 7 ++++++ 6 files changed, 58 insertions(+), 10 deletions(-) diff --git a/libensemble/alloc_funcs/fast_alloc_to_aposmm.py b/libensemble/alloc_funcs/fast_alloc_to_aposmm.py index 5f21754b8..4d15a96d1 100644 --- a/libensemble/alloc_funcs/fast_alloc_to_aposmm.py +++ b/libensemble/alloc_funcs/fast_alloc_to_aposmm.py @@ -31,20 +31,31 @@ def give_sim_work_first(W, H, sim_specs, gen_specs, persis_info): persis_info['next_to_give'] += 1 elif gen_count < gen_specs.get('num_active_gens', gen_count+1): + lw = persis_info['last_worker'] - # Don't give gen instances in batch mode if points are unfinished last_size = persis_info.get('last_size') - if (gen_specs.get('batch_mode') - and len(H) + if len(H): + # Don't give gen instances in batch mode if points are unfinished + if (gen_specs.get('batch_mode') and not all(np.logical_or(H['returned'][last_size:], H['paused'][last_size:]))): - break - else: - persis_info['last_size'] = len(H) + break + # Don't call APOSMM if there are runs going but none need advancing + if len(persis_info[lw]['run_order']): + runs_needing_to_advance = np.zeros(len(persis_info[lw]['run_order']),dtype=bool) + for run,inds in enumerate(persis_info[lw]['run_order'].values()): + runs_needing_to_advance[run] = np.all(H['returned'][inds]) + + if not np.any(runs_needing_to_advance): + break + + persis_info['last_size'] = len(H) # Give gen work persis_info['total_gen_calls'] += 1 gen_count += 1 - gen_work(Work, i, gen_specs['in'], persis_info[i], range(len(H))) + gen_work(Work, i, gen_specs['in'], persis_info[lw], range(len(H))) + + persis_info['last_worker'] = i return Work, persis_info diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 86bf3f55f..e445eac5a 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -66,7 +66,6 @@ 'localopt_method': 'LD_MMA', 'rk_const': 0.5*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi), 'xtol_rel': 1e-2, - 'batch_mode': True, 'num_active_gens':1, } @@ -82,12 +81,23 @@ persis_info = {'next_to_give':0} persis_info['total_gen_calls'] = 0 + persis_info['last_worker'] = 0 + persis_info[0] = {'active_runs': set(), + 'run_order': {}, + 'old_runs': {}, + 'total_runs': 0, + 'rand_stream': np.random.RandomState(1)} - for i in range(MPI.COMM_WORLD.Get_size()): + # Making persis_info fields to store APOSMM information, but will be 
passed + # to various workers. + + for i in range(1,MPI.COMM_WORLD.Get_size()): persis_info[i] = {'rand_stream': np.random.RandomState(i)} if run == 1: - # Change the bounds to put a local min at a corner point (to test that APOSMM handles the same point being in multiple runs) ability to give back a previously evaluated point) + # Change the bounds to put a local min at a corner point (to test that + # APOSMM handles the same point being in multiple runs) ability to + # give back a previously evaluated point) gen_specs['ub']= np.array([-2.9, -1.9]) gen_specs['mu']= 1e-4 gen_specs['rk_const']= 0.01*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi) diff --git a/libensemble/tests/regression_tests/test_branin_aposmm.py b/libensemble/tests/regression_tests/test_branin_aposmm.py index cdb7a9fb4..e9b197b17 100644 --- a/libensemble/tests/regression_tests/test_branin_aposmm.py +++ b/libensemble/tests/regression_tests/test_branin_aposmm.py @@ -88,6 +88,12 @@ persis_info = {} for i in range(MPI.COMM_WORLD.Get_size()): persis_info[i] = {'rand_stream': np.random.RandomState(i)} + +persis_info[1]['total_runs'] = 0 +persis_info[1]['active_runs'] = set() +persis_info[1]['run_order'] = {} +persis_info[1]['old_runs'] = {} +persis_info[1]['total_runs'] = 0 # Perform the run if __name__ == "__main__": diff --git a/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py b/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py index 8dea876bc..93bdd6b0f 100644 --- a/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py +++ b/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py @@ -97,6 +97,13 @@ for i in range(MPI.COMM_WORLD.Get_size()): persis_info[i] = {'rand_stream': np.random.RandomState(i)} + +persis_info['last_worker'] = 0 +persis_info[0] = {'active_runs': set(), + 'run_order': {}, + 'old_runs': {}, + 'total_runs': 0, + 'rand_stream': np.random.RandomState(1)} # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) diff --git a/libensemble/tests/regression_tests/test_chwirut_pounders.py b/libensemble/tests/regression_tests/test_chwirut_pounders.py index 219532513..2077b91ae 100644 --- a/libensemble/tests/regression_tests/test_chwirut_pounders.py +++ b/libensemble/tests/regression_tests/test_chwirut_pounders.py @@ -80,8 +80,15 @@ persis_info['has_nan'] = set() persis_info['already_paused'] = set() persis_info['H_len'] = 0 + for i in range(MPI.COMM_WORLD.Get_size()): persis_info[i] = {'rand_stream': np.random.RandomState(i)} + +persis_info[1] = {'active_runs': set(), + 'run_order': {}, + 'old_runs': {}, + 'total_runs': 0, + 'rand_stream': np.random.RandomState(1)} # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) diff --git a/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py b/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py index b3c17a0ff..effa927f0 100644 --- a/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py +++ b/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py @@ -78,6 +78,13 @@ for i in range(MPI.COMM_WORLD.Get_size()): persis_info[i] = {'rand_stream': np.random.RandomState(i)} + +persis_info['last_worker'] = 0 +persis_info[0] = {'active_runs': set(), + 'run_order': {}, + 'old_runs': {}, + 'total_runs': 0, + 
'rand_stream': np.random.RandomState(1)} # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) From 34e616327e30e0402181952fccdf12bc0ea13a6d Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 9 Oct 2018 16:28:43 -0500 Subject: [PATCH 074/101] Speeding up the second part of this test. --- .../tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index e445eac5a..6eaf810aa 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -107,6 +107,7 @@ gen_specs['ftol_rel'] = 1e-2 gen_specs['xtol_abs'] = 1e-3 gen_specs['ftol_abs'] = 1e-8 + exit_criteria = {'sim_max': 200} H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs) From 72a3fc0bd7d2cb23117d09ecce79f6c2dfd35ff9 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 9 Oct 2018 16:36:48 -0500 Subject: [PATCH 075/101] Adding max_active_runs to another test for better coverage --- .../tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 6eaf810aa..37eae13e6 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -67,6 +67,7 @@ 'rk_const': 0.5*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi), 'xtol_rel': 1e-2, 'num_active_gens':1, + 'max_active_runs':6, } From a67ca5a68a914acba2914c91d6568b3d4bde78f4 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 11 Oct 2018 13:07:45 -0500 Subject: [PATCH 076/101] Added assert to guard against all resources idle and no work assigned. --- libensemble/libE_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libensemble/libE_manager.py b/libensemble/libE_manager.py index cbf326046..dc3050f90 100644 --- a/libensemble/libE_manager.py +++ b/libensemble/libE_manager.py @@ -387,6 +387,8 @@ def run(self, persis_info): self._check_work_order(Work[w], w) self._send_work_order(Work[w], w) self._update_state_on_alloc(Work[w], w) + assert self.term_test() or any(self.W['active'] != 0), \ + "Should not wait for workers when all workers are idle." 
# Return persis_info, exit_flag return self._final_receive_and_kill(persis_info) From 939fcf412f24ef435b10e48b082c52c806c6615e Mon Sep 17 00:00:00 2001 From: David Bindel Date: Thu, 11 Oct 2018 14:50:52 -0500 Subject: [PATCH 077/101] Added timeout checks for all tests (make sure we get some useful output on hang) --- .../test_6-hump_camel_active_persistent_worker_abort.py | 2 +- .../tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 3 ++- .../test_6-hump_camel_persistent_uniform_sampling.py | 2 +- .../regression_tests/test_6-hump_camel_uniform_sampling.py | 3 ++- ...ump_camel_uniform_sampling_with_persistent_localopt_gens.py | 3 ++- .../test_6-hump_camel_with_different_nodes_uniform_sample.py | 2 +- .../test_chwirut_aposmm_one_residual_at_a_time.py | 2 ++ libensemble/tests/regression_tests/test_chwirut_pounders.py | 2 ++ .../test_chwirut_uniform_sampling_one_residual_at_a_time.py | 1 + libensemble/tests/regression_tests/test_comms.py | 3 ++- libensemble/tests/regression_tests/test_fast_alloc.py | 3 ++- .../tests/regression_tests/test_inverse_bayes_example.py | 3 ++- libensemble/tests/regression_tests/test_nan_func_aposmm.py | 3 ++- 13 files changed, 22 insertions(+), 10 deletions(-) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_active_persistent_worker_abort.py b/libensemble/tests/regression_tests/test_6-hump_camel_active_persistent_worker_abort.py index 3b770ae79..fd64da9b0 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_active_persistent_worker_abort.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_active_persistent_worker_abort.py @@ -68,7 +68,7 @@ # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 10} # Intentially set low so as to test that a worker in persistent mode can be terminated correctly +exit_criteria = {'sim_max': 10, 'elapsed_wallclock_time': 300} # Intentially set low so as to test that a worker in persistent mode can be terminated correctly np.random.seed(1) persis_info = {} diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 37eae13e6..a89ed1363 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -108,11 +108,12 @@ gen_specs['ftol_rel'] = 1e-2 gen_specs['xtol_abs'] = 1e-3 gen_specs['ftol_abs'] = 1e-8 - exit_criteria = {'sim_max': 200} + exit_criteria = {'sim_max': 200, 'elapsed_wallclock_time': 300} H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 short_name = script_name.split("test_", 1).pop() filename = short_name + '_results_History_length=' + str(len(H)) + '_evals=' + str(sum(H['returned'])) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) print("\n\n\nRun completed.\nSaving results to file: " + filename) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_persistent_uniform_sampling.py b/libensemble/tests/regression_tests/test_6-hump_camel_persistent_uniform_sampling.py index c886c5f35..c9a93c4d5 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_persistent_uniform_sampling.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_persistent_uniform_sampling.py @@ -44,7 +44,7 @@ } # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 40} +exit_criteria = {'sim_max': 40, 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {} diff --git 
a/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling.py b/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling.py index c4d164d61..cb5373785 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling.py @@ -44,7 +44,7 @@ } # Tell libEnsemble when to stop -exit_criteria = {'gen_max': 501} +exit_criteria = {'gen_max': 501, 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {} @@ -55,6 +55,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 short_name = script_name.split("test_", 1).pop() filename = short_name + '_results_History_length=' + str(len(H)) + '_evals=' + str(sum(H['returned'])) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) print("\n\n\nRun completed.\nSaving results to file: " + filename) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling_with_persistent_localopt_gens.py b/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling_with_persistent_localopt_gens.py index b3d2fa514..96257637a 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling_with_persistent_localopt_gens.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_uniform_sampling_with_persistent_localopt_gens.py @@ -79,7 +79,7 @@ ] # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 1000} +exit_criteria = {'sim_max': 1000, 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {} @@ -94,6 +94,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 short_name = script_name.split("test_", 1).pop() filename = short_name + '_results_History_length=' + str(len(H)) + '_evals=' + str(sum(H['returned'])) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) print("\n\n\nRun completed.\nSaving results to file: " + filename) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_with_different_nodes_uniform_sample.py b/libensemble/tests/regression_tests/test_6-hump_camel_with_different_nodes_uniform_sample.py index 5a2ed9856..e55ea5650 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_with_different_nodes_uniform_sample.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_with_different_nodes_uniform_sample.py @@ -68,7 +68,7 @@ } # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 10} +exit_criteria = {'sim_max': 10, 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {} diff --git a/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py b/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py index 93bdd6b0f..cd5268a4d 100644 --- a/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py +++ b/libensemble/tests/regression_tests/test_chwirut_aposmm_one_residual_at_a_time.py @@ -83,6 +83,7 @@ gen_specs['sample_points'] = np.random.uniform(0,1,(max_sim_budget,n))*(gen_specs['ub']-gen_specs['lb'])+gen_specs['lb'] exit_criteria = {'sim_max': max_sim_budget, # must be provided + 'elapsed_wallclock_time': 300 } alloc_specs = {'out':[('allocated',bool)], 'alloc_f':alloc_f} @@ -108,6 +109,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 assert len(H) >= max_sim_budget short_name = 
script_name.split("test_", 1).pop() filename = short_name + '_results_after_evals=' + str(max_sim_budget) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) diff --git a/libensemble/tests/regression_tests/test_chwirut_pounders.py b/libensemble/tests/regression_tests/test_chwirut_pounders.py index 2077b91ae..7b4200b7e 100644 --- a/libensemble/tests/regression_tests/test_chwirut_pounders.py +++ b/libensemble/tests/regression_tests/test_chwirut_pounders.py @@ -71,6 +71,7 @@ } exit_criteria = {'sim_max': max_sim_budget, # must be provided + 'elapsed_wallclock_time': 300 } libE_specs = {'queue_update_function': queue_update_function} @@ -93,6 +94,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 assert len(H) >= max_sim_budget short_name = script_name.split("test_", 1).pop() filename = short_name + '_results_after_evals=' + str(max_sim_budget) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) diff --git a/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py b/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py index effa927f0..8862a0959 100644 --- a/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py +++ b/libensemble/tests/regression_tests/test_chwirut_uniform_sampling_one_residual_at_a_time.py @@ -63,6 +63,7 @@ } exit_criteria = {'sim_max': max_sim_budget, # must be provided + 'elapsed_wallclock_time': 300 } alloc_specs = {'out':[('allocated',bool)], 'alloc_f':alloc_f} diff --git a/libensemble/tests/regression_tests/test_comms.py b/libensemble/tests/regression_tests/test_comms.py index 675a4c3c6..7b020abfa 100644 --- a/libensemble/tests/regression_tests/test_comms.py +++ b/libensemble/tests/regression_tests/test_comms.py @@ -68,7 +68,7 @@ } #sim_max = num_workers -exit_criteria = {'sim_max': sim_max} +exit_criteria = {'sim_max': sim_max, 'elapsed_wallclock_time': 300} np.random.seed(1) @@ -81,6 +81,7 @@ if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 #import pdb; pdb.set_trace() for w in range(1, num_workers+1): x = w * 1000.0 diff --git a/libensemble/tests/regression_tests/test_fast_alloc.py b/libensemble/tests/regression_tests/test_fast_alloc.py index 201730662..5b69a2135 100644 --- a/libensemble/tests/regression_tests/test_fast_alloc.py +++ b/libensemble/tests/regression_tests/test_fast_alloc.py @@ -56,7 +56,7 @@ gen_specs['gen_batch_size'] = num_pts//2 # Tell libEnsemble when to stop - exit_criteria = {'sim_max': num_pts} + exit_criteria = {'sim_max': num_pts, 'elapsed_walclock_time': 300} np.random.seed(1) persis_info = {'next_to_give':0} @@ -70,4 +70,5 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 assert len(H) == num_pts diff --git a/libensemble/tests/regression_tests/test_inverse_bayes_example.py b/libensemble/tests/regression_tests/test_inverse_bayes_example.py index 27218fe8f..c341f3189 100644 --- a/libensemble/tests/regression_tests/test_inverse_bayes_example.py +++ b/libensemble/tests/regression_tests/test_inverse_bayes_example.py @@ -48,7 +48,7 @@ } # Tell libEnsemble when to stop -exit_criteria = {'sim_max': gen_specs['subbatch_size']*gen_specs['num_subbatches']*gen_specs['num_batches']} +exit_criteria = {'sim_max': gen_specs['subbatch_size']*gen_specs['num_subbatches']*gen_specs['num_batches'], 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {} 
@@ -66,6 +66,7 @@ if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 # Change the last weights to correct values (H is a list on other cores and only array on manager) ind = 2*gen_specs['subbatch_size']*gen_specs['num_subbatches'] H[-ind:] = H['prior'][-ind:] + H['like'][-ind:] - H['prop'][-ind:] diff --git a/libensemble/tests/regression_tests/test_nan_func_aposmm.py b/libensemble/tests/regression_tests/test_nan_func_aposmm.py index d1547fef4..8567bd75a 100644 --- a/libensemble/tests/regression_tests/test_nan_func_aposmm.py +++ b/libensemble/tests/regression_tests/test_nan_func_aposmm.py @@ -76,7 +76,7 @@ def nan_func(calc_in,persis_info,sim_specs,libE_info): gen_specs['combine_component_func'] = np.linalg.norm # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 100} +exit_criteria = {'sim_max': 100, 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {} @@ -86,6 +86,7 @@ def nan_func(calc_in,persis_info,sim_specs,libE_info): # Perform the run H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info) if MPI.COMM_WORLD.Get_rank() == 0: + assert flag == 0 short_name = script_name.split("test_", 1).pop() filename = short_name + '_results_History_length=' + str(len(H)) + '_evals=' + str(sum(H['returned'])) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) print("\n\n\nRun completed.\nSaving results to file: " + filename) From 067e608293aa88e02d9402a01058eff211be9d45 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Fri, 12 Oct 2018 08:34:03 -0500 Subject: [PATCH 078/101] Fixing broken APOSMM to restart runs when 'max_active_runs' is no longer binding. --- libensemble/gen_funcs/aposmm.py | 13 +++++---- .../test_6-hump_camel_aposmm_LD_MMA.py | 27 +++++++++++-------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index f05e5aace..a94500cb0 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -71,7 +71,7 @@ def aposmm_logic(H,persis_info,gen_specs,_): Optional ``gen_specs`` entries are: - - ``'sample_points' [int]``: The points to be sampled (in the original domain) + - ``'sample_points' [numpy array]``: The points to be sampled (in the original domain) - ``'combine_component_func' [func]``: Function to combine objective components - ``'components' [int]``: Number of objective components - ``'dist_to_bound_multiple' [float in (0,1]]``: What fraction of the distance to the nearest boundary should the initial step size be in localopt runs @@ -177,6 +177,8 @@ def aposmm_logic(H,persis_info,gen_specs,_): k_sorted = np.argpartition(-dist_to_better,kth=gen_specs['max_active_runs']-1) # Take max_active_runs largest persis_info['active_runs'] = set(run_vals[k_sorted[:gen_specs['max_active_runs']],0].astype(int)) + else: + persis_info['active_runs'] = set(persis_info['run_order'].keys()) inactive_runs = set() @@ -216,13 +218,14 @@ def aposmm_logic(H,persis_info,gen_specs,_): if samples_needed > 0: if 'sample_points' in gen_specs: - v = sum(H['local_pt']) + v = sum(~H['local_pt']) # Number of sample points so far x_new = gen_specs['sample_points'][v:v+samples_needed] on_cube = False # We assume the points are on the original domain, not unit cube - else: - x_new = persis_info['rand_stream'].uniform(0,1,(samples_needed,n)) - on_cube = True + persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) + samples_needed = samples_needed - len(x_new) + x_new = persis_info['rand_stream'].uniform(0,1,(samples_needed,n)) + on_cube 
= True persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) O = np.append(H[np.array(list(updated_inds),dtype=int)][[o[0] for o in gen_specs['out']]],O) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 37eae13e6..2e95cb213 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -56,6 +56,18 @@ ('pt_id',int), # To be used by APOSMM to identify points evaluated by different simulations ] + +# The minima are known on this test problem. +# 1) We use their values to test APOSMM has identified all minima +# 2) We use their approximate values to ensure APOSMM evaluates a point in each +# minima's basin of attraction. +minima = np.array([[ -0.089842, 0.712656], + [ 0.089842, -0.712656], + [ -1.70361, 0.796084], + [ 1.70361, -0.796084], + [ -1.6071, -0.568651], + [ 1.6071, 0.568651]]) + # State the generating function, its arguments, output, and necessary parameters. gen_specs = {'gen_f': aposmm_logic, 'in': [o[0] for o in gen_out] + ['f', 'grad', 'returned'], @@ -63,16 +75,17 @@ 'lb': np.array([-3,-2]), 'ub': np.array([ 3, 2]), 'initial_sample_size': 100, + 'sample_points': np.round(minima,2), 'localopt_method': 'LD_MMA', 'rk_const': 0.5*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi), - 'xtol_rel': 1e-2, + 'xtol_rel': 1e-3, 'num_active_gens':1, 'max_active_runs':6, } # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 1000} +exit_criteria = {'sim_max': 300} alloc_specs = {'out':[('allocated',bool)], 'alloc_f':alloc_f} @@ -109,6 +122,7 @@ gen_specs['xtol_abs'] = 1e-3 gen_specs['ftol_abs'] = 1e-8 exit_criteria = {'sim_max': 200} + minima = np.array([[-2.9, -1.9]]) H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs) @@ -118,15 +132,6 @@ print("\n\n\nRun completed.\nSaving results to file: " + filename) np.save(filename, H) - if run == 0: - minima = np.array([[ -0.089842, 0.712656], - [ 0.089842, -0.712656], - [ -1.70361, 0.796084], - [ 1.70361, -0.796084], - [ -1.6071, -0.568651], - [ 1.6071, 0.568651]]) - else: - minima = np.array([[-2.9, -1.9]]) tol = 1e-4 for m in minima: From 424287369b192c69831d6d5665c8a9245832d8f0 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Fri, 12 Oct 2018 09:20:23 -0500 Subject: [PATCH 079/101] Fixing APOSMM to make the correct samplesize when there aren't enough points in gen_specs['sample_points'] --- libensemble/gen_funcs/aposmm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index a94500cb0..72bc6b761 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -216,14 +216,14 @@ def aposmm_logic(H,persis_info,gen_specs,_): else: samples_needed = int(not bool(len(O))) # 1 if len(O)==0, 0 otherwise - if samples_needed > 0: - if 'sample_points' in gen_specs: - v = sum(~H['local_pt']) # Number of sample points so far - x_new = gen_specs['sample_points'][v:v+samples_needed] - on_cube = False # We assume the points are on the original domain, not unit cube - persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) - samples_needed = samples_needed - len(x_new) + if samples_needed > 0 and 'sample_points' in gen_specs: + v = sum(~H['local_pt']) # Number of sample points so far + x_new = 
gen_specs['sample_points'][v:v+samples_needed] + on_cube = False # We assume the points are on the original domain, not unit cube + persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) + samples_needed = samples_needed - len(x_new) + if samples_needed > 0: x_new = persis_info['rand_stream'].uniform(0,1,(samples_needed,n)) on_cube = True persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) From 517b6ddaef28ce3d8bb9037d0903264ee0def875 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Mon, 15 Oct 2018 09:12:06 -0500 Subject: [PATCH 080/101] Tightening convergence for regression test --- .../test_6-hump_camel_aposmm_LD_MMA.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 2e95cb213..43ee28ce2 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -75,7 +75,7 @@ 'lb': np.array([-3,-2]), 'ub': np.array([ 3, 2]), 'initial_sample_size': 100, - 'sample_points': np.round(minima,2), + 'sample_points': np.round(minima,1), 'localopt_method': 'LD_MMA', 'rk_const': 0.5*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi), 'xtol_rel': 1e-3, @@ -85,7 +85,7 @@ # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 300} +exit_criteria = {'sim_max': 400} alloc_specs = {'out':[('allocated',bool)], 'alloc_f':alloc_f} @@ -132,11 +132,10 @@ print("\n\n\nRun completed.\nSaving results to file: " + filename) np.save(filename, H) - - tol = 1e-4 + tol = 1e-5 for m in minima: - print(np.min(np.sum((H['x']-m)**2,1))) - assert np.min(np.sum((H['x']-m)**2,1)) < tol + print(np.min(np.sum((H[H['local_min']]['x']-m)**2,1))) + assert np.min(np.sum((H[H['local_min']]['x']-m)**2,1)) < tol print("\nlibEnsemble with APOSMM using a gradient-based localopt method has identified the " + str(np.shape(minima)[0]) + " minima within a tolerance " + str(tol)) From d4ac8c8109b99b51531681b5ec1501bc6018a70c Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Mon, 15 Oct 2018 09:15:42 -0500 Subject: [PATCH 081/101] Fixing aposmm sample pts --- libensemble/gen_funcs/aposmm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/gen_funcs/aposmm.py b/libensemble/gen_funcs/aposmm.py index 72bc6b761..7764bccf8 100644 --- a/libensemble/gen_funcs/aposmm.py +++ b/libensemble/gen_funcs/aposmm.py @@ -220,7 +220,8 @@ def aposmm_logic(H,persis_info,gen_specs,_): v = sum(~H['local_pt']) # Number of sample points so far x_new = gen_specs['sample_points'][v:v+samples_needed] on_cube = False # We assume the points are on the original domain, not unit cube - persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) + if len(x_new): + persis_info = add_points_to_O(O, x_new, H, gen_specs, c_flag, persis_info, on_cube=on_cube) samples_needed = samples_needed - len(x_new) if samples_needed > 0: From 7da9d73eb0c28c3fc7032811722efe3494252a9d Mon Sep 17 00:00:00 2001 From: shudson Date: Mon, 15 Oct 2018 13:17:06 -0500 Subject: [PATCH 082/101] Send regression test output to screen --- libensemble/tests/run-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/run-tests.sh b/libensemble/tests/run-tests.sh index c8e24d798..b82db5cd3 100755 --- a/libensemble/tests/run-tests.sh +++ b/libensemble/tests/run-tests.sh @@ 
-407,7 +407,7 @@ if [ "$root_found" = true ]; then mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN -m pytest $TEST_SCRIPT >> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err test_code=$? else - mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN $COV_LINE_PARALLEL $TEST_SCRIPT >> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err + mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN $COV_LINE_PARALLEL $TEST_SCRIPT #>> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err test_code=$? fi reg_count_runs=$((reg_count_runs+1)) From 2fe5e05fcf61df4300bab64c7a60760a05ed8b14 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Mon, 15 Oct 2018 13:54:15 -0500 Subject: [PATCH 083/101] Trying to add a barrier to help travis fail --- .../tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 43ee28ce2..da2014e85 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -139,4 +139,6 @@ print("\nlibEnsemble with APOSMM using a gradient-based localopt method has identified the " + str(np.shape(minima)[0]) + " minima within a tolerance " + str(tol)) + MPI.COMM_WORLD.barrier() + From 12471a66d3370164306bba309fb07d5504f9ceed Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Mon, 15 Oct 2018 14:06:18 -0500 Subject: [PATCH 084/101] Removing barrier and assert and just aborting when a minimum isn't found --- .../regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index da2014e85..fc7a9f485 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -135,10 +135,11 @@ tol = 1e-5 for m in minima: print(np.min(np.sum((H[H['local_min']]['x']-m)**2,1))) - assert np.min(np.sum((H[H['local_min']]['x']-m)**2,1)) < tol + sys.stdout.flush() + if np.min(np.sum((H[H['local_min']]['x']-m)**2,1)) > tol: + MPI.COMM_WORLD.Abort() print("\nlibEnsemble with APOSMM using a gradient-based localopt method has identified the " + str(np.shape(minima)[0]) + " minima within a tolerance " + str(tol)) - MPI.COMM_WORLD.barrier() From c3ea6cca69a3476f01e44c79947c6f661870c9dc Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Mon, 15 Oct 2018 14:44:27 -0500 Subject: [PATCH 085/101] Trying again and ensuring correct Abort number --- .../regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index fc7a9f485..6c17704ee 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -137,9 +137,6 @@ print(np.min(np.sum((H[H['local_min']]['x']-m)**2,1))) sys.stdout.flush() if np.min(np.sum((H[H['local_min']]['x']-m)**2,1)) > tol: - MPI.COMM_WORLD.Abort() + MPI.COMM_WORLD.Abort(1) print("\nlibEnsemble with APOSMM using a gradient-based localopt method has identified the " + str(np.shape(minima)[0]) + " minima within 
a tolerance " + str(tol)) - - - From 8d162ebd4156fbfd97b6493c5b8b1205b92d6ec5 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Mon, 15 Oct 2018 14:45:59 -0500 Subject: [PATCH 086/101] Increasing function evaluations to hopefully avoid this issue in the future. --- .../tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 6c17704ee..5f6835fa7 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -85,7 +85,7 @@ # Tell libEnsemble when to stop -exit_criteria = {'sim_max': 400} +exit_criteria = {'sim_max': 1000} alloc_specs = {'out':[('allocated',bool)], 'alloc_f':alloc_f} From c375572216de5e12a6c619b98561b666d99f13a6 Mon Sep 17 00:00:00 2001 From: David Bindel Date: Tue, 16 Oct 2018 10:55:50 -0500 Subject: [PATCH 087/101] Added test for zero points returned when no worker active. --- libensemble/libE_manager.py | 2 ++ .../regression_tests/test_6-hump_camel_aposmm_LD_MMA.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/libensemble/libE_manager.py b/libensemble/libE_manager.py index dc3050f90..be35a40b8 100644 --- a/libensemble/libE_manager.py +++ b/libensemble/libE_manager.py @@ -284,6 +284,8 @@ def _update_state_on_worker_msg(self, persis_info, D_recv, w): self.hist.update_history_f(D_recv) if calc_type == EVAL_GEN_TAG: self.hist.update_history_x_in(w, D_recv['calc_out']) + assert len(D_recv['calc_out']) or np.any(self.W['active']), \ + "Gen must return work when is is the only thing active." if 'libE_info' in D_recv and 'persistent' in D_recv['libE_info']: # Now a waiting, persistent worker self.W[w-1]['persis_state'] = calc_type diff --git a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py index 709b11443..98b531f3a 100644 --- a/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py +++ b/libensemble/tests/regression_tests/test_6-hump_camel_aposmm_LD_MMA.py @@ -127,7 +127,12 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs) if MPI.COMM_WORLD.Get_rank() == 0: - assert flag == 0 + + if flag != 0: + print("Exit was not on convergence (code {})".format(flag)) + sys.stdout.flush() + MPI.COMM_WORLD.Abort(1) + short_name = script_name.split("test_", 1).pop() filename = short_name + '_results_History_length=' + str(len(H)) + '_evals=' + str(sum(H['returned'])) + '_ranks=' + str(MPI.COMM_WORLD.Get_size()) print("\n\n\nRun completed.\nSaving results to file: " + filename) From f4c71e9a6029c306c54eca04fb53ef74ce71682c Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 6 Nov 2018 15:50:20 -0600 Subject: [PATCH 088/101] Removing warning from coveragerc; ensuring exit_criteria are right --- libensemble/libE.py | 2 +- libensemble/tests/regression_tests/.coveragerc | 3 --- libensemble/tests/regression_tests/test_fast_alloc.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index 42c28b30a..b5477d00d 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -196,7 +196,7 @@ def check_inputs(libE_specs, alloc_specs, sim_specs, gen_specs, exit_criteria, H assert len(exit_criteria) > 0, "Must have some exit criterion" valid_term_fields = 
['sim_max', 'gen_max', 'elapsed_wallclock_time', 'stop_val'] - assert any([term_field in exit_criteria for term_field in valid_term_fields]), "Must have a valid termination option: " + str(valid_term_fields) + assert all([term_field in valid_term_fields for term_field in exit_criteria]), "Valid termination options: " + str(valid_term_fields) assert len(sim_specs['out']), "sim_specs must have 'out' entries" assert len(gen_specs['out']), "gen_specs must have 'out' entries" diff --git a/libensemble/tests/regression_tests/.coveragerc b/libensemble/tests/regression_tests/.coveragerc index 3e3860818..0fdb97c37 100644 --- a/libensemble/tests/regression_tests/.coveragerc +++ b/libensemble/tests/regression_tests/.coveragerc @@ -13,9 +13,6 @@ data_file = .cov_reg_out parallel = true -include = *.py - *.c - [report] omit = */__init__.py/* diff --git a/libensemble/tests/regression_tests/test_fast_alloc.py b/libensemble/tests/regression_tests/test_fast_alloc.py index 5b69a2135..3b591ff90 100644 --- a/libensemble/tests/regression_tests/test_fast_alloc.py +++ b/libensemble/tests/regression_tests/test_fast_alloc.py @@ -56,7 +56,7 @@ gen_specs['gen_batch_size'] = num_pts//2 # Tell libEnsemble when to stop - exit_criteria = {'sim_max': num_pts, 'elapsed_walclock_time': 300} + exit_criteria = {'sim_max': num_pts, 'elapsed_wallclock_time': 300} np.random.seed(1) persis_info = {'next_to_give':0} From 55cac1297949b60227deb85d7bcd661755a59476 Mon Sep 17 00:00:00 2001 From: shudson Date: Sat, 3 Nov 2018 13:42:42 -0500 Subject: [PATCH 089/101] Add central_mode flag to job controllers --- libensemble/balsam_controller.py | 9 +++++++-- libensemble/libE_worker.py | 2 ++ libensemble/mpi_controller.py | 9 ++++++++- libensemble/resources.py | 4 +++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 5649abb65..1feba63ec 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -122,14 +122,19 @@ class BalsamJobController(MPIJobController): .. note:: Job kills are not configurable in the Balsam job_controller. """ - def __init__(self, auto_resources=True, + def __init__(self, auto_resources=True, central_mode=True, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new BalsamJobController instance. 
A new BalsamJobController object is created with an application registry and configuration attributes """ - super().__init__(auto_resources, + + if not central_mode: + logger.warning("Balsam does not currently support distributed mode - running in central mode") + central_mode=True + + super().__init__(auto_resources, central_mode, nodelist_env_slurm, nodelist_env_cobalt) self.mpi_launcher = None if MPI.COMM_WORLD.Get_rank() == 0: diff --git a/libensemble/libE_worker.py b/libensemble/libE_worker.py index ea8cc9d6c..f15c780d9 100644 --- a/libensemble/libE_worker.py +++ b/libensemble/libE_worker.py @@ -250,6 +250,8 @@ def run(self, Work, calc_in): calc_stats.calc_type = calc_type try: + if calc_type == EVAL_GEN_TAG: + logger.debug("Running generator") calc = self._run_calc[calc_type] with calc_stats.timer: with self.loc_stack.loc(calc_type): diff --git a/libensemble/mpi_controller.py b/libensemble/mpi_controller.py index 03b338521..03f4d6990 100644 --- a/libensemble/mpi_controller.py +++ b/libensemble/mpi_controller.py @@ -21,7 +21,7 @@ class MPIJobController(JobController): """The MPI job_controller can create, poll and kill runnable MPI jobs """ - def __init__(self, auto_resources=True, + def __init__(self, auto_resources=True, central_mode=False, nodelist_env_slurm=None, nodelist_env_cobalt=None): """Instantiate a new JobController instance. @@ -39,6 +39,12 @@ def __init__(self, auto_resources=True, Auto-detect available processor resources and assign to jobs if not explicitly provided on launch. + central_mode, optional: boolean: + If true, then running in central mode, else distributed. + Central mode means libE processes (manager and workers) are grouped together and + do not share nodes with applications. Distributed mode means Workers share nodes + with applications. + nodelist_env_slurm: String, optional The environment variable giving a node list in Slurm format (Default: Uses SLURM_NODELIST). 
Note: This is only queried if @@ -56,6 +62,7 @@ def __init__(self, auto_resources=True, if self.auto_resources: self.resources = \ MPIResources(top_level_dir=self.top_level_dir, + central_mode=central_mode, nodelist_env_slurm=nodelist_env_slurm, nodelist_env_cobalt=nodelist_env_cobalt) diff --git a/libensemble/resources.py b/libensemble/resources.py index 6ed83e5d2..bacf5a5fd 100644 --- a/libensemble/resources.py +++ b/libensemble/resources.py @@ -81,7 +81,9 @@ def __init__(self, top_level_dir=None, workerID=None, central_mode=False, self.top_level_dir = top_level_dir or os.getcwd() self.central_mode = central_mode - + if self.central_mode: + logger.debug('Running in central mode') + # These presence of these env vars will be used to detect scheduler self.nodelist_env_slurm = nodelist_env_slurm or Resources.default_nodelist_env_slurm self.nodelist_env_cobalt = nodelist_env_cobalt or Resources.default_nodelist_env_cobalt From 43442a1d31c17e0f8974ba347a7a40d629bc88da Mon Sep 17 00:00:00 2001 From: shudson Date: Sat, 3 Nov 2018 13:44:29 -0500 Subject: [PATCH 090/101] Add -z option to test runner to print output to screen - add to travis --- .travis.yml | 2 +- libensemble/tests/run-tests.sh | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 53e2bcd0d..05f7b1357 100644 --- a/.travis.yml +++ b/.travis.yml @@ -70,7 +70,7 @@ install: # Run test script: - - libensemble/tests/run-tests.sh + - libensemble/tests/run-tests.sh -z # Coverage after_success: diff --git a/libensemble/tests/run-tests.sh b/libensemble/tests/run-tests.sh index b82db5cd3..92d600a24 100755 --- a/libensemble/tests/run-tests.sh +++ b/libensemble/tests/run-tests.sh @@ -166,15 +166,17 @@ RUN_PREFIX=$script_name CLEAN_ONLY=false unset MPIEXEC_FLAGS PYTEST_SHOW_OUT_ERR=false +RTEST_SHOW_OUT_ERR=false usage() { echo -e "\nUsage:" - echo " $0 [-hcsur] [-p <2|3>] [-n ] [-a ]" 1>&2; + echo " $0 [-hcsurz] [-p <2|3>] [-n ] [-a ]" 1>&2; echo "" echo "Options:" echo " -h Show this help message and exit" echo " -c Clean up test directories and exit" - echo " -s Print stdout and stderr to screen when running pytest (unit tests)" + echo " -s Print stdout and stderr to screen when running pytest (unit tests)" + echo " -z Print stdout and stderr to screen when running regression tests (run without pytest)" echo " -u Run only the unit tests" echo " -r Run only the regression tests" echo " -p {version} Select a version of python. E.g. -p 2 will run with the python2 exe" @@ -185,7 +187,7 @@ usage() { exit 1 } -while getopts ":p:n:a:hcsur" opt; do +while getopts ":p:n:a:hcszur" opt; do case $opt in p) echo "Parameter supplied for Python version: $OPTARG" >&2 @@ -208,7 +210,10 @@ while getopts ":p:n:a:hcsur" opt; do echo "Will show stdout and stderr during pytest" PYTEST_SHOW_OUT_ERR=true ;; - u) + z) + echo "Will show stdout and stderr during regression tests" + RTEST_SHOW_OUT_ERR=true + ;; u) echo "Running only the unit tests" export RUN_REG_TESTS=false ;; @@ -407,8 +412,13 @@ if [ "$root_found" = true ]; then mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN -m pytest $TEST_SCRIPT >> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err test_code=$? else - mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN $COV_LINE_PARALLEL $TEST_SCRIPT #>> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err - test_code=$? + if [ "$RTEST_SHOW_OUT_ERR" = "true" ]; then + mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN $COV_LINE_PARALLEL $TEST_SCRIPT + test_code=$? 
+ else + mpiexec -np $NPROCS $MPIEXEC_FLAGS $PYTHON_RUN $COV_LINE_PARALLEL $TEST_SCRIPT >> $TEST_SCRIPT.$NPROCS'procs'.$REG_TEST_OUTPUT_EXT 2>test.err + test_code=$? + fi fi reg_count_runs=$((reg_count_runs+1)) @@ -508,7 +518,9 @@ if [ "$root_found" = true ]; then else if [ $REG_STOP_ON_FAILURE != "true" ]; then echo -e "" - echo -e "\n..see error log at $REG_TEST_SUBDIR/log.err" + if [ "$RTEST_SHOW_OUT_ERR" != "true" ]; then + echo -e "\n..see error log at $REG_TEST_SUBDIR/log.err" + fi summ_line="$reg_fail failed, $reg_pass passed in $reg_time seconds" tput bold;tput setaf 1; print_summary_line $summ_line From 9577ee3a6cc3f69bc61e36fe5e56f8a332cbc894 Mon Sep 17 00:00:00 2001 From: shudson Date: Sat, 3 Nov 2018 13:50:13 -0500 Subject: [PATCH 091/101] Change of balsam models location --- libensemble/balsam_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index 1feba63ec..fbb67e020 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -14,7 +14,7 @@ from libensemble.mpi_controller import MPIJobController import balsam.launcher.dag as dag -from balsam.service import models +from balsam.core import models logger = logging.getLogger(__name__ + '(' + MPIResources.get_my_name() + ')') #For debug messages in this module - uncomment From a0e4155ef2ac0a9944fa73865cc8e2efd6763e3c Mon Sep 17 00:00:00 2001 From: shudson Date: Sat, 3 Nov 2018 13:55:53 -0500 Subject: [PATCH 092/101] Add python 3.7 using official workaround --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 05f7b1357..863b8c5c5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,15 @@ language: python +sudo: required +dist: xenial python: - 2.7 - 3.4 - 3.5 - 3.6 - #- 3.7 + - 3.7 os: linux dist: trusty -sudo: false env: global: From 62101ef3cb16c4daa983251a7bd838d6df1845d7 Mon Sep 17 00:00:00 2001 From: shudson Date: Sat, 3 Nov 2018 14:58:09 -0500 Subject: [PATCH 093/101] Fix run-tests.sh when test.err does not exist --- libensemble/tests/run-tests.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/run-tests.sh b/libensemble/tests/run-tests.sh index 92d600a24..32fa5095d 100755 --- a/libensemble/tests/run-tests.sh +++ b/libensemble/tests/run-tests.sh @@ -431,7 +431,8 @@ if [ "$root_found" = true ]; then code=$test_code #sh - currently stop on failure if [ $REG_STOP_ON_FAILURE != "true" ]; then #Dump error to log file - echo -e "\nTest $test_num: $TEST_SCRIPT on $NPROCS processes:\n" >>log.err; cat test.err >>log.err + echo -e "\nTest $test_num: $TEST_SCRIPT on $NPROCS processes:\n" >>log.err + [ -e test.err ] && cat test.err >>log.err fi; reg_fail=$((reg_fail+1)) fi; From 5ede3690c9b92e5470e072f6a6a700ec7dc32dd6 Mon Sep 17 00:00:00 2001 From: shudson Date: Mon, 5 Nov 2018 10:39:26 -0600 Subject: [PATCH 094/101] Test if distribution results in test failure --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 863b8c5c5..d35dd2e15 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,12 @@ language: python sudo: required -dist: xenial +#dist: xenial python: - 2.7 - 3.4 - 3.5 - 3.6 - - 3.7 + #- 3.7 os: linux dist: trusty From f956b934d3c1a13bd0c9a6707434f76fe309a61b Mon Sep 17 00:00:00 2001 From: shudson Date: Mon, 5 Nov 2018 14:42:37 -0600 Subject: [PATCH 095/101] Turn off auto-resources in job controller regression test --- .travis.yml | 5 
++--- .../tests/regression_tests/test_jobcontroller_hworld.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index d35dd2e15..42fd00fe7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,14 @@ language: python sudo: required -#dist: xenial +dist: xenial python: - 2.7 - 3.4 - 3.5 - 3.6 - #- 3.7 + - 3.7 os: linux -dist: trusty env: global: diff --git a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py index 16425b85f..30ea0b019 100644 --- a/libensemble/tests/regression_tests/test_jobcontroller_hworld.py +++ b/libensemble/tests/regression_tests/test_jobcontroller_hworld.py @@ -38,7 +38,7 @@ def build_simfunc(): jobctrl = BalsamJobController(auto_resources = True) else: from libensemble.mpi_controller import MPIJobController - jobctrl = MPIJobController(auto_resources = True) + jobctrl = MPIJobController(auto_resources = False) jobctrl.register_calc(full_path=sim_app, calc_type='sim') summary_file_name = short_name + '.libe_summary.txt' From 996a8fceea8396dd25b8dc667c6f93601f243c20 Mon Sep 17 00:00:00 2001 From: shudson Date: Wed, 7 Nov 2018 11:42:10 -0600 Subject: [PATCH 096/101] Update version and setup.py info --- libensemble/__init__.py | 4 ++-- setup.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/libensemble/__init__.py b/libensemble/__init__.py index f879cf40a..dc3a0c548 100644 --- a/libensemble/__init__.py +++ b/libensemble/__init__.py @@ -4,6 +4,6 @@ Library for managing ensemble-like collections of computations. """ -__version__ = "0.3.0" -__author__ = 'Jeffrey Larson and Stephen Hudson' +__version__ = "0.4.0" +__author__ = 'Jeffrey Larson, Stephen Hudson and David Bindel' __credits__ = 'Argonne National Laboratory' diff --git a/setup.py b/setup.py index 52d47b209..c54d66595 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,10 @@ def run_tests(self): setup( name='libensemble', - version='0.3.0', + version='0.4.0', description='Library for managing ensemble-like collections of computations', url='https://github.com/Libensemble/libensemble', - author='Jeffrey Larson and Stephen Hudson and David Bindel', + author='Jeffrey Larson, Stephen Hudson and David Bindel', author_email='libensemble@lists.mcs.anl.gov', license='BSD 2-clause', @@ -51,7 +51,7 @@ def run_tests(self): tests_require=['pytest>=3.1', 'pytest-cov>=2.5', 'pytest-pep8>=1.0', - 'tox>=2.7' + 'pytest-timeout', ], classifiers=[ @@ -68,6 +68,7 @@ def run_tests(self): 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: Implementation :: CPython', 'Topic :: Scientific/Engineering', 'Topic :: Software Development :: Libraries :: Python Modules' From 21c6dcb45bdcdc0c21e0e1f62fed83ded75fce68 Mon Sep 17 00:00:00 2001 From: shudson Date: Wed, 7 Nov 2018 14:15:59 -0600 Subject: [PATCH 097/101] Update job controller documentation for v0.4.0 --- docs/conf.py | 1 + docs/job_controller/balsam_controller.rst | 25 ++++++++++++ docs/job_controller/jc_index.rst | 4 +- docs/job_controller/job_controller.rst | 49 ++++++++--------------- docs/job_controller/mpi_controller.rst | 17 ++++++++ docs/job_controller/overview.rst | 26 ++++++------ docs/job_controller/register.rst | 19 --------- libensemble/balsam_controller.py | 2 +- libensemble/controller.py | 17 +++++--- 9 files changed, 86 insertions(+), 74 deletions(-) create mode 100644 
docs/job_controller/balsam_controller.rst create mode 100644 docs/job_controller/mpi_controller.rst delete mode 100644 docs/job_controller/register.rst diff --git a/docs/conf.py b/docs/conf.py index c9b17b4ef..c8ec95e5b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -66,6 +66,7 @@ def __getattr__(cls, name): ##breathe_projects_source = {"libEnsemble" : ( "../code/src/", ["libE.py", "test.cpp"] )} #breathe_projects_source = {"libEnsemble" : ( "../code/src/", ["test.cpp","test2.cpp"] )} +autodoc_mock_imports = ["balsam"] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/job_controller/balsam_controller.rst b/docs/job_controller/balsam_controller.rst new file mode 100644 index 000000000..7068b1856 --- /dev/null +++ b/docs/job_controller/balsam_controller.rst @@ -0,0 +1,25 @@ +Balsam Job Controller +===================== + +To create a Balsam job controller, the calling script should contain:: + + jobctr = BalsamJobController() + +The Balsam job controller inherits from the MPI job controller. See the +:doc:`MPIJobController` for shared API. Any differences are +shown below. + +.. automodule:: balsam_controller + :no-undoc-members: + +.. autoclass:: BalsamJobController + :show-inheritance: +.. :inherited-members: +.. :member-order: bysource +.. :members: __init__, launch, poll, manager_poll, kill, set_kill_mode + +.. autoclass:: BalsamJob + :show-inheritance: + :member-order: bysource +.. :members: workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout +.. :inherited-members: diff --git a/docs/job_controller/jc_index.rst b/docs/job_controller/jc_index.rst index 9d330f749..22e6da49b 100644 --- a/docs/job_controller/jc_index.rst +++ b/docs/job_controller/jc_index.rst @@ -4,9 +4,9 @@ Job Controller The job controller can be used with simulation functions to provide a simple, portable interface for running and managing user jobs. .. toctree:: - :maxdepth: 1 + :maxdepth: 2 + :titlesonly: :caption: libEnsemble Job Controller: overview - register job_controller diff --git a/docs/job_controller/job_controller.rst b/docs/job_controller/job_controller.rst index af7179145..91cf69557 100644 --- a/docs/job_controller/job_controller.rst +++ b/docs/job_controller/job_controller.rst @@ -3,46 +3,31 @@ Job Controller Module .. automodule:: controller :no-undoc-members: - + See :doc:`example` for usage. - -JobController Class -------------------- - -The JobController should be constructed after registering applications to a Registry:: - - jobctl = JobController(registry = registry) - -or if using Balsam:: - - jobctr = BalsamJobController(registry = registry) - -.. autoclass:: JobController - :member-order: bysource - :members: __init__, launch, poll, manager_poll, kill, set_kill_mode -.. autoclass:: BalsamJobController - :show-inheritance: - :member-order: bysource -.. :members: __init__, launch, poll, manager_poll, kill, set_kill_mode +See the controller APIs for optional arguments. +.. toctree:: + :maxdepth: 1 + :caption: Job Controllers: + + mpi_controller + balsam_controller Job Class --------- -Jobs are created and returned though the job_controller launch function. Jobs can be passed as arguments -to the job_controller poll and kill functions. Job information can be queired through the job attributes below and the query functions. Note that the job attributes are only updated when they are polled (or though other -job controller functions). 
+Jobs are created and returned though the job_controller launch function. Jobs can be polled and +killed with the respective poll and kill functions. Job information can be queried through the job attributes +below and the query functions. Note that the job attributes are only updated when they are +polled/killed (or through other job or job controller functions). .. autoclass:: Job - :member-order: bysource - :members: workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout, stderr_exists, read_stderr - -.. autoclass:: BalsamJob - :show-inheritance: - :member-order: bysource -.. :members: workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout -.. :inherited-members: + :members: + :exclude-members: calc_job_timing,check_poll +.. :member-order: bysource +.. :members: poll, kill, workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout, stderr_exists, read_stderr Job Attributes @@ -65,7 +50,7 @@ Run configuration attributes - Some will be auto-generated: :job.workdir: (string) Work directory for the job :job.name: (string) Name of job - auto-generated -:job.app: (app obj) Use application/executable, registered using registry.register_calc +:job.app: (app obj) Use application/executable, registered using jobctl.register_calc :job.app_args: (string) Application arguments as a string :job.num_procs: (int) Total number of processors for job :job.num_nodes: (int) Number of nodes for job diff --git a/docs/job_controller/mpi_controller.rst b/docs/job_controller/mpi_controller.rst new file mode 100644 index 000000000..3fb213711 --- /dev/null +++ b/docs/job_controller/mpi_controller.rst @@ -0,0 +1,17 @@ +MPI Job Controller +================== + +To create an MPI job controller, the calling script should contain:: + + jobctl = MPIJobController() + +See the controller API below for optional arguments. + +.. automodule:: mpi_controller + :no-undoc-members: + +.. autoclass:: MPIJobController + :show-inheritance: + :inherited-members: +.. :member-order: bysource +.. :members: __init__, register_calc, launch, manager_poll diff --git a/docs/job_controller/overview.rst b/docs/job_controller/overview.rst index a3f715cd4..c96234293 100644 --- a/docs/job_controller/overview.rst +++ b/docs/job_controller/overview.rst @@ -3,33 +3,31 @@ Job Controller Overview The Job Controller module can be used by the worker or user-side code to issue and manage jobs using a portable interface. Various back-end mechanisms may be used to implement this interface on the system, either specified by the user at the top-level, or auto-detected. The job_controller manages jobs using the launch, poll and kill functions. Job attributes can then be queried to determine status. Functions are also provided to access and interrogate files in the job's working directory. -At the top-level calling script, a registry and job_controller are created and the executable gen or sim applications are registered to these (these are applications that will be runnable parallel jobs). If an alternative job_controller, such as Balsam, is to be used, then these can be created as in the example. Once in the user-side worker code (sim/gen func), the job_controller can be retrieved without any need to specify the type. +At the top-level calling script, a job_controller is created and the executable gen or sim applications are registered to it (these are applications that will be runnable jobs). 
If an alternative job_controller, such as Balsam, is to be used, then these can be created as in the example. Once in the user-side worker code (sim/gen func), an MPI based job_controller can be retrieved without any need to specify the specific type. **Example usage (code runnable with or without a Balsam backend):** In calling function:: - from libensemble.register import Register, BalsamRegister - from libensemble.controller import JobController, BalsamJobController sim_app = '/path/to/my/exe' USE_BALSAM = False if USE_BALSAM: - registry = BalsamRegister() - jobctrl = BalsamJobController(registry = registry) + from libensemble.balsam_controller import BalsamJobController + jobctrl = BalsamJobController() else: - registry = Register() - jobctrl = JobController(registry = registry) + from libensemble.mpi_controller import MPIJobController + jobctrl = MPIJobController() - registry.register_calc(full_path=sim_app, calc_type='sim') + jobctrl.register_calc(full_path=sim_app, calc_type='sim') In user sim func:: - from libensemble.controller import JobController + jobctl = MPIJobController.controller # This will work for inherited controllers also (eg. Balsam) import time - jobctl = JobController.controller #Will return controller (whether Balsam or standard). - job = jobctl.launch(calc_type='sim', num_procs=8, app_args='input.txt', stdout='out.txt') + jobctl = MPIJobController.controller # Will return controller (whether Balsam or standard MPI). + job = jobctl.launch(calc_type='sim', num_procs=8, app_args='input.txt', stdout='out.txt', stderr='err.txt') while time.time() - start < timeout_sec: time.sleep(delay) @@ -37,10 +35,10 @@ In user sim func:: # Has manager sent a finish signal jobctl.manager_poll() if jobctl.manager_signal == 'finish': - jobctl.kill(job) + job.kill() # Poll job to see if completed - jobctl.poll(job) + job.poll() if job.finished: print(job.state) break @@ -48,7 +46,7 @@ In user sim func:: # Check output file for error and kill job if job.stdout_exists(): if 'Error' in job.read_stdout(): - jobctl.kill(job) + job.kill() break See the :doc:`job_controller` interface for API. diff --git a/docs/job_controller/register.rst b/docs/job_controller/register.rst deleted file mode 100644 index 08433dd5e..000000000 --- a/docs/job_controller/register.rst +++ /dev/null @@ -1,19 +0,0 @@ -Registration Module -=================== - -.. automodule:: register - :no-undoc-members: - -See :doc:`example` for usage. - -.. autoclass:: register.Register - :member-order: bysource - :members: __init__, register_calc - -.. autoclass:: register.BalsamRegister - :show-inheritance: - :member-order: bysource - :members: __init__, register_calc - -.. .. autoclass:: register.Application -.. :members: __init__ diff --git a/libensemble/balsam_controller.py b/libensemble/balsam_controller.py index fbb67e020..a9e0506e6 100644 --- a/libensemble/balsam_controller.py +++ b/libensemble/balsam_controller.py @@ -117,7 +117,7 @@ def kill(self, wait_time=None): class BalsamJobController(MPIJobController): - """Inherits from JobController and wraps the Balsam job management service + """Inherits from MPIJobController and wraps the Balsam job management service .. note:: Job kills are not configurable in the Balsam job_controller. diff --git a/libensemble/controller.py b/libensemble/controller.py index ad9a30a1a..2f87a6498 100644 --- a/libensemble/controller.py +++ b/libensemble/controller.py @@ -1,11 +1,12 @@ """ Module to launch and control running jobs. -Contains job_controller and job. 
Inherited classes include MPI and Balsam -variants. A job_controller can create and manage multiple jobs. The worker or -user-side code can issue and manage jobs using the launch, poll and kill -functions. Job attributes are queried to determine status. Functions are also -provided to access and interrogate files in the job's working directory. +Contains job_controller and job. The class JobController is a base class and not +intended for direct use. Instead one of the inherited classes should be used. Inherited +classes include MPI and Balsam variants. A job_controller can create and manage +multiple jobs. The worker or user-side code can issue and manage jobs using the launch, +poll and kill functions. Job attributes are queried to determine status. Functions are +also provided to access and interrogate files in the job's working directory. """ @@ -76,7 +77,7 @@ def __init__(self, app=None, app_args=None, workdir=None, self.id = next(Job.newid) #Status attributes - self.state = 'CREATED' #: test1 docstring + self.state = 'CREATED' self.process = None self.errcode = None self.finished = False # True means job ran, not that it succeeded @@ -325,6 +326,10 @@ def set_workerID(self, workerid): """Sets the worker ID for this job_controller""" self.workerID = workerid + def poll(self, job): + "Polls a job" + job.poll(job) + def kill(self, job): "Kill a job" jassert(isinstance(job, Job), "Invalid job has been provided") From b13762d6034a64659acdcfbcdd2c968e08ad3924 Mon Sep 17 00:00:00 2001 From: shudson Date: Wed, 7 Nov 2018 14:36:10 -0600 Subject: [PATCH 098/101] Other doc updates --- docs/dev_guide/worker_module.rst | 3 --- libensemble/libE.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/dev_guide/worker_module.rst b/docs/dev_guide/worker_module.rst index e8a01d2dc..de4ea78d5 100644 --- a/docs/dev_guide/worker_module.rst +++ b/docs/dev_guide/worker_module.rst @@ -3,6 +3,3 @@ Worker Module .. automodule:: libE_worker :members: worker_main, Worker -.. 
autoclass:: Worker - :member-order: bysource - :members: init_workers, __init__, run, clean diff --git a/libensemble/libE.py b/libensemble/libE.py index b5477d00d..7c1769ccc 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -116,7 +116,7 @@ def libE(sim_specs, gen_specs, exit_criteria, persis_info={}, exit_flag: :obj:`int` Flag containing job status: 0 = No errors, - 1 = Exception occured and MPI aborted, + 1 = Exception occured 2 = Manager timed out and ended simulation """ From 46eff812c36246b9d94d2f0632b3d17e5f46a41c Mon Sep 17 00:00:00 2001 From: shudson Date: Wed, 7 Nov 2018 14:49:52 -0600 Subject: [PATCH 099/101] Update example submission scripts - including central mode bebop script --- .../bebop_submit_slurm.sh | 6 ++- .../bebop_submit_slurm_centralmode.sh | 52 +++++++++++++++++++ .../theta_submit_balsam.sh | 6 +-- 3 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 examples/job_submission_scripts/bebop_submit_slurm_centralmode.sh diff --git a/examples/job_submission_scripts/bebop_submit_slurm.sh b/examples/job_submission_scripts/bebop_submit_slurm.sh index 49952e259..d73de4fa5 100644 --- a/examples/job_submission_scripts/bebop_submit_slurm.sh +++ b/examples/job_submission_scripts/bebop_submit_slurm.sh @@ -15,7 +15,11 @@ export EXE=libE_calling_script.py export NUM_WORKERS=4 export MANAGER_NODE=false #true = Manager has a dedicated node (use one extra node for SBATCH -N) -export I_MPI_FABRICS=shm:ofa + +unset I_MPI_FABRICS +export I_MPI_FABRICS_LIST=tmi,tcp +export I_MPI_FALLBACK=1 + #If using in calling script (After N mins manager kills workers and timing.dat created.) export LIBE_WALLCLOCK=55 diff --git a/examples/job_submission_scripts/bebop_submit_slurm_centralmode.sh b/examples/job_submission_scripts/bebop_submit_slurm_centralmode.sh new file mode 100644 index 000000000..d852b158f --- /dev/null +++ b/examples/job_submission_scripts/bebop_submit_slurm_centralmode.sh @@ -0,0 +1,52 @@ +#!/bin/bash +#SBATCH -J libE_test_central +#SBATCH -N 5 +#SBATCH -p knlall +##SBATCH -A +#SBATCH -o tlib.%j.%N.out +#SBATCH -e tlib.%j.%N.error +#SBATCH -t 01:00:00 + +#Launch script for running in central mode. +#LibEnsemble will run on a dedicated node (or nodes). +#The remaining nodes in the allocation will be dedicated to the jobs launched by the workers. + +#Requirements for running: +# Must use job_controller with auto-resources=True and central_mode=True. +# Note: Requires a schedular having an environment variable giving a global nodelist in a supported format (eg. SLURM/COBALT) +# Otherwise a worker_list file will be required. + +#Currently requires even distribution - either multiple workers per node or nodes per worker + + +#User to edit these variables +export EXE=libE_calling_script.py +export NUM_WORKERS=4 + +export I_MPI_FABRICS=shm:tmi + +#If using in calling script (After N mins manager kills workers and timing.dat created.) +export LIBE_WALLCLOCK=55 + +#--------------------------------------------------------------------------------------------- +#Test +echo -e "Slurm job ID: $SLURM_JOBID" + +#cd $PBS_O_WORKDIR +cd $SLURM_SUBMIT_DIR + +# A little useful information for the log file... +echo -e "Master process running on: $HOSTNAME" +echo -e "Directory is: $PWD" + +#This will work for the number of contexts that will fit on one node (eg. 320 on Bebop) - increase libE nodes for more. +cmd="srun --overcommit --ntasks=$(($NUM_WORKERS+1)) --nodes=1 python $EXE $LIBE_WALLCLOCK" + +echo The command is: $cmd +echo End PBS script information. 
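+# Illustrative reminder (not part of the scheduler setup): the calling script in
+# $EXE is assumed to build its job controller in central mode, along the lines of
+#   from libensemble.mpi_controller import MPIJobController
+#   jobctrl = MPIJobController(auto_resources=True, central_mode=True)
+#   jobctrl.register_calc(full_path='/path/to/sim_app', calc_type='sim')
+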
+echo All further output is from the process being run and not the pbs script.\n\n $cmd # Print the date again -- when finished + +$cmd + +# Print the date again -- when finished +echo Finished at: `date` diff --git a/examples/job_submission_scripts/theta_submit_balsam.sh b/examples/job_submission_scripts/theta_submit_balsam.sh index a63670980..89a85a172 100644 --- a/examples/job_submission_scripts/theta_submit_balsam.sh +++ b/examples/job_submission_scripts/theta_submit_balsam.sh @@ -79,15 +79,15 @@ SCRIPT_BASENAME=${EXE%.*} balsam app --name $SCRIPT_BASENAME.app --exec $EXE --desc "Run $SCRIPT_BASENAME" # Running libE on one node - one manager and upto 63 workers -balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 1 --ranks-per-node $((NUM_WORKERS+1)) --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.dat" --url-in="local:/$THIS_DIR/*" --yes +balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 1 --ranks-per-node $((NUM_WORKERS+1)) --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes # Hyper-thread libE (note this will not affect HT status of user calcs - only libE itself) # Running 255 workers and one manager on one libE node. -# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 1 --ranks-per-node 256 --threads-per-core 4 --url-out="local:/$THIS_DIR/*" --stage-out-files="*.out *.dat" --url-in="local:/$THIS_DIR" --yes +# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 1 --ranks-per-node 256 --threads-per-core 4 --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes # Multiple nodes for libE # Running 127 workers and one manager - launch script on 129 nodes (if one node per worker) -# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 2 --ranks-per-node 64 --url-out="local:/$THIS_DIR/*" --stage-out-files="*.out *.dat" --url-in="local:/$THIS_DIR" --yes +# balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 2 --ranks-per-node 64 --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes #Run job balsam launcher --consume-all --job-mode=mpi --num-transition-threads=1 From dae6f2c85a7b3b13f2f4f1e7d6831d3eebef36f0 Mon Sep 17 00:00:00 2001 From: shudson Date: Wed, 7 Nov 2018 15:13:13 -0600 Subject: [PATCH 100/101] Add fast alloc tests to examples --- examples/alloc_funcs/fast_alloc.py | 1 + examples/alloc_funcs/fast_alloc_to_aposmm.py | 1 + examples/calling_scripts/test_fast_alloc.py | 1 + 3 files changed, 3 insertions(+) create mode 120000 examples/alloc_funcs/fast_alloc.py create mode 120000 examples/alloc_funcs/fast_alloc_to_aposmm.py create mode 120000 examples/calling_scripts/test_fast_alloc.py diff --git a/examples/alloc_funcs/fast_alloc.py b/examples/alloc_funcs/fast_alloc.py new file mode 120000 index 000000000..dd8fd3421 --- /dev/null +++ 
b/examples/alloc_funcs/fast_alloc.py @@ -0,0 +1 @@ +../../libensemble/alloc_funcs/fast_alloc.py \ No newline at end of file diff --git a/examples/alloc_funcs/fast_alloc_to_aposmm.py b/examples/alloc_funcs/fast_alloc_to_aposmm.py new file mode 120000 index 000000000..0ef92bad4 --- /dev/null +++ b/examples/alloc_funcs/fast_alloc_to_aposmm.py @@ -0,0 +1 @@ +../../libensemble/alloc_funcs/fast_alloc_to_aposmm.py \ No newline at end of file diff --git a/examples/calling_scripts/test_fast_alloc.py b/examples/calling_scripts/test_fast_alloc.py new file mode 120000 index 000000000..d81cdd12e --- /dev/null +++ b/examples/calling_scripts/test_fast_alloc.py @@ -0,0 +1 @@ +../../libensemble/tests/regression_tests/test_fast_alloc.py \ No newline at end of file From bcd7df9fce6378708d193eee0dfe48a13efb3e8c Mon Sep 17 00:00:00 2001 From: shudson Date: Wed, 7 Nov 2018 15:25:53 -0600 Subject: [PATCH 101/101] Add release notes for v0.4.0 --- docs/release_notes.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 31c621bc4..2469a97ef 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -2,6 +2,23 @@ Release Notes ============= +Release 0.4.0 +------------- + +:Date: November 7, 2018 + +* Separate job controller classes into different modules including a base class (API change) +* Add central_mode run option to distributed type (MPI) job_controllers (API addition) (#93) +* Make poll and kill job methods (API change) +* In job_controller, set_kill_mode is removed and replaced by a wait argument for a hard kill (API change) +* Removed register module - incorporated into job_controller (API change) +* APOSMM has improved asynchronicity when batch mode is false (with new example). (#96) +* Manager errors (instead of hangs) when alloc_f or gen_f don't return work when all workers are idle. (#95) + +:Known issues: + +* OpenMPI is not supported with direct MPI launches as nested MPI launches are not supported. + Release 0.3.0 -------------