From 9e70cb5256cdc87ebf6c77b3721faa9f309b5cf6 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 3 Dec 2021 17:05:52 -0600 Subject: [PATCH 01/93] initial commit for attempting new balsam executor. very WIP --- libensemble/executors/bexample.py | 29 ++ libensemble/executors/executor.py | 3 +- libensemble/executors/new_balsam_executor.py | 341 +++++++++++++++++++ 3 files changed, 372 insertions(+), 1 deletion(-) create mode 100644 libensemble/executors/bexample.py create mode 100644 libensemble/executors/new_balsam_executor.py diff --git a/libensemble/executors/bexample.py b/libensemble/executors/bexample.py new file mode 100644 index 000000000..afda7f0b0 --- /dev/null +++ b/libensemble/executors/bexample.py @@ -0,0 +1,29 @@ +from balsam.api import ApplicationDefinition, BatchJob, Job +import time + +class VecNorm(ApplicationDefinition): + site = "one" + + def run(self, vec): + return sum(x**2 for x in vec)**0.5 + +job = VecNorm.submit(workdir="test/1", vec=[3, 4]) + +batchjob = BatchJob.objects.create( + site_id=job.site_id, + num_nodes=1, + wall_time_min=10, + job_mode="mpi", + queue="local", + project="local", +) + +import ipdb; ipdb.set_trace() + + + +print('hello') +print(job.result()) + +for job in Job.objects.as_completed(jobs): + print(job.workdir, job.result()) diff --git a/libensemble/executors/executor.py b/libensemble/executors/executor.py index 477a0accf..071e90dca 100644 --- a/libensemble/executors/executor.py +++ b/libensemble/executors/executor.py @@ -77,7 +77,7 @@ class Application: prefix = 'libe_app' - def __init__(self, full_path, name=None, calc_type='sim', desc=None): + def __init__(self, full_path, name=None, calc_type='sim', desc=None, site=None): """Instantiates a new Application instance.""" self.full_path = full_path self.calc_type = calc_type @@ -87,6 +87,7 @@ def __init__(self, full_path, name=None, calc_type='sim', desc=None): self.full_path = ' '.join((sys.executable, full_path)) self.name = name or self.exe self.desc = desc or (self.exe + 
' app') + self.site = site or None self.gname = '_'.join([Application.prefix, self.name]) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py new file mode 100644 index 000000000..7d17aa4cd --- /dev/null +++ b/libensemble/executors/new_balsam_executor.py @@ -0,0 +1,341 @@ +""" +This module launches and controls the running of tasks with Balsam. + +.. note:: Balsam is supported only when using ``mpi`` comms and requires Python 3.6 or higher. + +In order to create a Balsam executor, the calling script should contain :: + + exctr = BalsamMPIExecutor() + +The Balsam executor inherits from the MPI executor. See the +:doc:`MPIExecutor` for shared API. Any differences are +shown below. + +""" + +import os +import logging +import time +import datetime + +from libensemble.resources import mpi_resources +from libensemble.executors.executor import \ + Application, Task, ExecutorException, TimeoutExpired, jassert, STATES +from libensemble.executors.mpi_executor import MPIExecutor +from libensemble.executors.executor import Application + +from balsam.api import ApplicationDefinition, BatchJob, Job, EventLog + +logger = logging.getLogger(__name__) +# To change logging level for just this module +# logger.setLevel(logging.DEBUG) + +class BalsamTask(Task): + """Wraps a Balsam Task from the Balsam service + + The same attributes and query routines are implemented. + + """ + + def __init__(self, app=None, app_args=None, workdir=None, + stdout=None, stderr=None, workerid=None): + """Instantiate a new BalsamTask instance. + + A new BalsamTask object is created with an id, status and + configuration attributes. This will normally be created by the + executor on a submission. 
+ """ + # May want to override workdir with Balsam value when it exists + Task.__init__(self, app, app_args, workdir, stdout, stderr, workerid) + + def _get_time_since_balsam_submit(self): + """Return time since balsam task entered RUNNING state""" + + # If wait_on_start then can could calculate runtime same a base executor + # but otherwise that will return time from task submission. Get from Balsam. + + # self.runtime = self.process.runtime_seconds # Only reports at end of run currently + # balsam_launch_datetime = self.process.get_state_times().get('RUNNING', None) + balsam_launch_datetime = EventLog.objects.filter( + job_id=self.process.job_id, to_state="RUNNING").timestamp + current_datetime = datetime.datetime.now() + if balsam_launch_datetime: + return (current_datetime - balsam_launch_datetime).total_seconds() + else: + return 0 + + def calc_task_timing(self): + """Calculate timing information for this task""" + + # Get runtime from Balsam + self.runtime = self._get_time_since_balsam_submit() + + if self.submit_time is None: + logger.warning("Cannot calc task total_time - submit time not set") + return + + if self.total_time is None: + self.total_time = time.time() - self.submit_time + + def _set_complete(self, dry_run=False): + """Set task as complete""" + self.finished = True + if dry_run: + self.success = True + self.state = 'FINISHED' + else: + balsam_state = self.process.state + self.workdir = self.workdir or self.process.working_directory + self.calc_task_timing() + self.success = (balsam_state == 'JOB_FINISHED') + if balsam_state == 'JOB_FINISHED': + self.state = 'FINISHED' + elif balsam_state == 'PARENT_KILLED': # Not currently used + self.state = 'USER_KILLED' + elif balsam_state in STATES: # In my states + self.state = balsam_state + else: + logger.warning("Task finished, but in unrecognized " + "Balsam state {}".format(balsam_state)) + self.state = 'UNKNOWN' + + logger.info("Task {} ended with state {}". 
+ format(self.name, self.state)) + + def poll(self): + """Polls and updates the status attributes of the supplied task""" + if self.dry_run: + return + + if not self._check_poll(): + return + + # Get current state of tasks from Balsam database + self.process.refresh_from_db() + balsam_state = self.process.state + self.runtime = self._get_time_since_balsam_submit() + + if balsam_state in models.END_STATES: + self._set_complete() + + elif balsam_state in models.ACTIVE_STATES: + self.state = 'RUNNING' + self.workdir = self.workdir or self.process.working_directory + + elif (balsam_state in models.PROCESSABLE_STATES or + balsam_state in models.RUNNABLE_STATES): + self.state = 'WAITING' + + else: + raise ExecutorException( + "Task state returned from Balsam is not in known list of " + "Balsam states. Task state is {}".format(balsam_state)) + + def wait(self, timeout=None): + """Waits on completion of the task or raises TimeoutExpired exception + + Status attributes of task are updated on completion. + + Parameters + ---------- + + timeout: + Time in seconds after which a TimeoutExpired exception is raised""" + + if self.dry_run: + return + + if not self._check_poll(): + return + + # Wait on the task + start = time.time() + self.process.refresh_from_db() + while self.process.state not in models.END_STATES: + time.sleep(0.2) + self.process.refresh_from_db() + if timeout and time.time() - start > timeout: + self.runtime = self._get_time_since_balsam_submit() + raise TimeoutExpired(self.name, timeout) + + self.runtime = self._get_time_since_balsam_submit() + self._set_complete() + + def kill(self, wait_time=None): + """ Kills or cancels the supplied task """ + + dag.kill(self.process) + + # Could have Wait here and check with Balsam its killed - + # but not implemented yet. 
+ + logger.info("Killing task {}".format(self.name)) + self.state = 'USER_KILLED' + self.finished = True + self.calc_task_timing() + +class NewBalsamMPIExecutor(MPIExecutor): + """Inherits from MPIExecutor and wraps the Balsam task management service + + .. note:: Task kills are not configurable in the Balsam executor. + + """ + def __init__(self, custom_info={}): + """Instantiate a new BalsamMPIExecutor instance. + + A new BalsamMPIExecutor object is created with an application + registry and configuration attributes + """ + + if custom_info: + logger.warning("The Balsam executor does not support custom_info - ignoring") + + super().__init__(custom_info) + + self.workflow_name = "libe_workflow" + self.application_objs = {} + + def serial_setup(self): + """Balsam serial setup includes empyting database and adding applications""" + + for app in self.apps.values(): + calc_name = app.gname + desc = app.desc + full_path = app.full_path + site = app.site + self.application_objs[calc_name] = self.add_app(calc_name, site, full_path, desc) + + + def add_app(name, site, exepath, desc): + """ Sync application with balsam service """ + + class BalsamApplication(ApplicationDefinition): + site = site + command_template=exepath + + BalsamApplication.sync() + logger.debug("Added App {}".format(name)) + return BalsamApplication + + def register_app(self, full_path, site, app_name=None, calc_type=None, desc=None): + """Registers a user application to libEnsemble. + + The ``full_path`` of the application must be supplied. Either + ``app_name`` or ``calc_type`` can be used to identify the + application in user scripts (in the **submit** function). + ``app_name`` is recommended. + + Parameters + ---------- + + full_path: String + The full path of the user application to be registered + + site: String + The Balsam site name for where to launch the app + + app_name: String, optional + Name to identify this application. 
+ + calc_type: String, optional + Calculation type: Set this application as the default 'sim' + or 'gen' function. + + desc: String, optional + Description of this application + + """ + if not app_name: + app_name = os.path.split(full_path)[1] + self.apps[app_name] = Application(full_path, app_name, calc_type, desc, site) + + # Default sim/gen apps will be deprecated. Just use names. + if calc_type is not None: + jassert(calc_type in self.default_apps, + "Unrecognized calculation type", calc_type) + self.default_apps[calc_type] = self.apps[app_name] + + def set_resources(self, resources): + self.resources = resources + + def submit(self, calc_type=None, app_name=None, num_procs=None, + num_nodes=None, procs_per_node=None, machinefile=None, + app_args=None, stdout=None, stderr=None, stage_inout=None, + hyperthreads=False, dry_run=False, wait_on_start=False, queue=None, + project=None, wall_time_min=None, extra_args=''): + """Creates a new task, and either executes or schedules to execute + in the executor + + The created task object is returned. 
+ """ + + if app_name is not None: + app = self.get_app(app_name) + elif calc_type is not None: + app = self.default_app(calc_type) + else: + raise ExecutorException("Either app_name or calc_type must be set") + + # Specific to this class + if machinefile is not None: + logger.warning("machinefile arg ignored - not supported in Balsam") + jassert(num_procs or num_nodes or procs_per_node, + "No procs/nodes provided - aborting") + + num_procs, num_nodes, procs_per_node = \ + mpi_resources.task_partition(num_procs, num_nodes, procs_per_node) + + if stdout is not None or stderr is not None: + logger.warning("Balsam does not currently accept a stdout " + "or stderr name - ignoring") + stdout = None + stderr = None + + # Will be possible to override with arg when implemented + # (or can have option to let Balsam assign) + default_workdir = os.getcwd() + task = BalsamTask(app, app_args, default_workdir, + stdout, stderr, self.workerID) + + add_task_args = {'name': task.name, + 'workflow': self.workflow_name, + 'user_workdir': default_workdir, + 'application': app.gname, + 'args': task.app_args, + 'num_nodes': num_nodes, + 'procs_per_node': procs_per_node, + 'mpi_flags': extra_args} + + if dry_run: + task.dry_run = True + logger.info('Test (No submit) to Balsam: {}'.format(' '.join(add_task_args))) + task._set_complete(dry_run=True) + else: + + balsam_app_obj = self.application_objs[task.name] + task.process = balsam_app_obj.submit(workdir=self.workflow_name) + + task.batchjob = BatchJob.objects.create( + site_id=task.process.site_id, + num_nodes=num_nodes, + ranks_per_node=procs_per_node, + wall_time_min=wall_time_min, + job_mode="mpi", + queue=queue, + project=project + ) + + if (wait_on_start): + self._wait_on_start(task) + + if not task.timer.timing: + task.timer.start() + task.submit_time = task.timer.tstart # Time not date - may not need if using timer. + + logger.info("Added task to Balsam database {}: " + "nodes {} ppn {}". 
+ format(task.name, num_nodes, procs_per_node)) + + # task.workdir = task.process.working_directory # Might not be set yet! + self.list_of_tasks.append(task) + return task From 63128e570cf2f128295b58f03c07cfb18b13a65d Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 7 Dec 2021 14:22:29 -0600 Subject: [PATCH 02/93] update poll, wait, and kill for new balsam states. remove test file. flake8 --- libensemble/executors/bexample.py | 29 -------------------- libensemble/executors/new_balsam_executor.py | 26 +++++++++--------- 2 files changed, 13 insertions(+), 42 deletions(-) delete mode 100644 libensemble/executors/bexample.py diff --git a/libensemble/executors/bexample.py b/libensemble/executors/bexample.py deleted file mode 100644 index afda7f0b0..000000000 --- a/libensemble/executors/bexample.py +++ /dev/null @@ -1,29 +0,0 @@ -from balsam.api import ApplicationDefinition, BatchJob, Job -import time - -class VecNorm(ApplicationDefinition): - site = "one" - - def run(self, vec): - return sum(x**2 for x in vec)**0.5 - -job = VecNorm.submit(workdir="test/1", vec=[3, 4]) - -batchjob = BatchJob.objects.create( - site_id=job.site_id, - num_nodes=1, - wall_time_min=10, - job_mode="mpi", - queue="local", - project="local", -) - -import ipdb; ipdb.set_trace() - - - -print('hello') -print(job.result()) - -for job in Job.objects.as_completed(jobs): - print(job.workdir, job.result()) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 7d17aa4cd..90763f450 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -22,14 +22,14 @@ from libensemble.executors.executor import \ Application, Task, ExecutorException, TimeoutExpired, jassert, STATES from libensemble.executors.mpi_executor import MPIExecutor -from libensemble.executors.executor import Application -from balsam.api import ApplicationDefinition, BatchJob, Job, EventLog +from balsam.api import ApplicationDefinition, 
BatchJob, EventLog logger = logging.getLogger(__name__) # To change logging level for just this module # logger.setLevel(logging.DEBUG) + class BalsamTask(Task): """Wraps a Balsam Task from the Balsam service @@ -115,17 +115,20 @@ def poll(self): balsam_state = self.process.state self.runtime = self._get_time_since_balsam_submit() - if balsam_state in models.END_STATES: + if balsam_state in ['RUN_DONE', 'POSTPROCESSED', 'STAGED_OUT', "JOB_FINISHED"]: self._set_complete() - elif balsam_state in models.ACTIVE_STATES: + elif balsam_state in ['RUNNING']: self.state = 'RUNNING' self.workdir = self.workdir or self.process.working_directory - elif (balsam_state in models.PROCESSABLE_STATES or - balsam_state in models.RUNNABLE_STATES): + elif balsam_state in ['CREATED', 'AWAITING_PARENTS', + 'READY', 'STAGED_IN', 'PREPROCESSED']: self.state = 'WAITING' + elif balsam_state in ['RUN_ERROR', 'RUN_TIMEOUT', 'FAILED']: + self.state = 'FAILED' + else: raise ExecutorException( "Task state returned from Balsam is not in known list of " @@ -151,7 +154,7 @@ def wait(self, timeout=None): # Wait on the task start = time.time() self.process.refresh_from_db() - while self.process.state not in models.END_STATES: + while self.process.state not in ['RUN_DONE', 'POSTPROCESSED', 'STAGED_OUT', "JOB_FINISHED"]: time.sleep(0.2) self.process.refresh_from_db() if timeout and time.time() - start > timeout: @@ -164,16 +167,14 @@ def wait(self, timeout=None): def kill(self, wait_time=None): """ Kills or cancels the supplied task """ - dag.kill(self.process) - - # Could have Wait here and check with Balsam its killed - - # but not implemented yet. 
+ self.process.delete() logger.info("Killing task {}".format(self.name)) self.state = 'USER_KILLED' self.finished = True self.calc_task_timing() + class NewBalsamMPIExecutor(MPIExecutor): """Inherits from MPIExecutor and wraps the Balsam task management service @@ -205,13 +206,12 @@ def serial_setup(self): site = app.site self.application_objs[calc_name] = self.add_app(calc_name, site, full_path, desc) - def add_app(name, site, exepath, desc): """ Sync application with balsam service """ class BalsamApplication(ApplicationDefinition): site = site - command_template=exepath + command_template = exepath BalsamApplication.sync() logger.debug("Added App {}".format(name)) From 8103bae4a956efd892dbdf3557dd62598cf10f0d Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 14 Dec 2021 12:10:56 -0600 Subject: [PATCH 03/93] intermediate work with forces+balsam, move AppDef class --- libensemble/executors/new_balsam_executor.py | 31 ++++++++++--------- .../tests/scaling_tests/forces/forces_simf.py | 21 +++++++------ .../scaling_tests/forces/run_libe_forces.py | 11 ++++--- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 90763f450..3b6249c76 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -29,6 +29,10 @@ # To change logging level for just this module # logger.setLevel(logging.DEBUG) +class libEBalsamApplication(ApplicationDefinition): + site = "default_site" + command_template = "default_path" + class BalsamTask(Task): """Wraps a Balsam Task from the Balsam service @@ -57,7 +61,7 @@ def _get_time_since_balsam_submit(self): # self.runtime = self.process.runtime_seconds # Only reports at end of run currently # balsam_launch_datetime = self.process.get_state_times().get('RUNNING', None) balsam_launch_datetime = EventLog.objects.filter( - job_id=self.process.job_id, to_state="RUNNING").timestamp + 
job_id=self.process.id, to_state="RUNNING") current_datetime = datetime.datetime.now() if balsam_launch_datetime: return (current_datetime - balsam_launch_datetime).total_seconds() @@ -204,18 +208,15 @@ def serial_setup(self): desc = app.desc full_path = app.full_path site = app.site - self.application_objs[calc_name] = self.add_app(calc_name, site, full_path, desc) + self.add_app(calc_name, site, full_path, desc) - def add_app(name, site, exepath, desc): + def add_app(self, name, site, exepath, desc): """ Sync application with balsam service """ - class BalsamApplication(ApplicationDefinition): - site = site - command_template = exepath - - BalsamApplication.sync() + libEBalsamApplication.site = site + libEBalsamApplication.command_template = exepath + libEBalsamApplication.sync() logger.debug("Added App {}".format(name)) - return BalsamApplication def register_app(self, full_path, site, app_name=None, calc_type=None, desc=None): """Registers a user application to libEnsemble. @@ -261,8 +262,8 @@ def set_resources(self, resources): def submit(self, calc_type=None, app_name=None, num_procs=None, num_nodes=None, procs_per_node=None, machinefile=None, app_args=None, stdout=None, stderr=None, stage_inout=None, - hyperthreads=False, dry_run=False, wait_on_start=False, queue=None, - project=None, wall_time_min=None, extra_args=''): + hyperthreads=False, gpus_per_rank=0, dry_run=False, wait_on_start=False, + queue=None, project=None, wall_time_min=None, extra_args=''): """Creates a new task, and either executes or schedules to execute in the executor @@ -312,13 +313,15 @@ def submit(self, calc_type=None, app_name=None, num_procs=None, task._set_complete(dry_run=True) else: - balsam_app_obj = self.application_objs[task.name] - task.process = balsam_app_obj.submit(workdir=self.workflow_name) + task.process = libEBalsamApplication.submit(workdir=self.workflow_name, + num_nodes=num_nodes, + ranks_per_node=procs_per_node, + wall_time_min=wall_time_min, + 
gpus_per_rank=gpus_per_rank) task.batchjob = BatchJob.objects.create( site_id=task.process.site_id, num_nodes=num_nodes, - ranks_per_node=procs_per_node, wall_time_min=wall_time_min, job_mode="mpi", queue=queue, diff --git a/libensemble/tests/scaling_tests/forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/forces_simf.py index 3dbb350bb..2a89523d2 100644 --- a/libensemble/tests/scaling_tests/forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/forces_simf.py @@ -78,15 +78,18 @@ def run_forces(H, persis_info, sim_specs, libE_info): if sim_specs['user']['fail_on_submit']: machinefile = 'fail' - # Machinefile only used here for exception testing - if cores: - task = exctr.submit(app_name='forces', num_procs=cores, app_args=args, - stdout='out.txt', stderr='err.txt', wait_on_start=True, - machinefile=machinefile) - else: - task = exctr.submit(app_name='forces', app_args=args, stdout='out.txt', - stderr='err.txt', wait_on_start=True, hyperthreads=True, - machinefile=machinefile) # Auto-partition + # # Machinefile only used here for exception testing + # if cores: + # task = exctr.submit(app_name='forces', num_procs=cores, app_args=args, + # stdout='out.txt', stderr='err.txt', wait_on_start=True, + # machinefile=machinefile) + # else: + # task = exctr.submit(app_name='forces', app_args=args, stdout='out.txt', + # stderr='err.txt', wait_on_start=True, hyperthreads=True, + # machinefile=machinefile) # Auto-partition + + task = exctr.submit(app_name='forces', num_procs=cores, app_args=args, wait_on_start=True, + queue='local', project='local', wall_time_min=10) # Auto-partition # Stat file to check for bad runs statfile = 'forces.stat' diff --git a/libensemble/tests/scaling_tests/forces/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/run_libe_forces.py index d578dd43f..9bb019dfb 100644 --- a/libensemble/tests/scaling_tests/forces/run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/run_libe_forces.py @@ -10,7 +10,7 @@ from 
libensemble import logger from forces_support import test_libe_stats, test_ensemble_dir, check_log_exception -USE_BALSAM = False +USE_BALSAM = True PERSIS_GEN = False if PERSIS_GEN: @@ -38,12 +38,15 @@ # Create executor and register sim to it. if USE_BALSAM: - from libensemble.executors.balsam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.executors.new_balsam_executor import NewBalsamMPIExecutor + exctr = NewBalsamMPIExecutor() + exctr.register_app(full_path=sim_app, site='three', app_name='forces') + else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() -exctr.register_app(full_path=sim_app, app_name='forces') + exctr.register_app(full_path=sim_app, app_name='forces') + # Note: Attributes such as kill_rate are to control forces tests, this would not be a typical parameter. From 7ea61bd83c12a7471a27284f5c65a213a22f5c4c Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 31 Jan 2022 11:30:54 -0600 Subject: [PATCH 04/93] initial commit for balsam-forces, small adjustments in preparation for passing AppDef instances instead --- libensemble/executors/__init__.py | 5 +- libensemble/executors/new_balsam_executor.py | 11 +- .../balsam_forces/balsam_forces.yaml | 44 +++++++ .../balsam_forces/build_forces.sh | 39 ++++++ .../scaling_tests/balsam_forces/cleanup.sh | 1 + .../balsam_forces/forces_simf.py | 114 ++++++++++++++++++ .../scaling_tests/balsam_forces/readme.md | 55 +++++++++ .../balsam_forces/run_libe_forces_balsam.py | 31 +++++ 8 files changed, 290 insertions(+), 10 deletions(-) create mode 100644 libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml create mode 100644 libensemble/tests/scaling_tests/balsam_forces/build_forces.sh create mode 100644 libensemble/tests/scaling_tests/balsam_forces/cleanup.sh create mode 100644 libensemble/tests/scaling_tests/balsam_forces/forces_simf.py create mode 100644 libensemble/tests/scaling_tests/balsam_forces/readme.md create mode 100644 
libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 45584a513..850484c36 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -1,9 +1,12 @@ from libensemble.executors.executor import Executor from libensemble.executors.mpi_executor import MPIExecutor +from libensemble.executors.new_balsam_executor import NewBalsamMPIExecutor import os import sys if 'BALSAM_DB_PATH' in os.environ and int(sys.version[2]) >= 6: from libensemble.executors.balsam_executor import BalsamMPIExecutor -__all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor'] + + +__all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamMPIExecutor'] diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 3b6249c76..d9a81a079 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -29,10 +29,6 @@ # To change logging level for just this module # logger.setLevel(logging.DEBUG) -class libEBalsamApplication(ApplicationDefinition): - site = "default_site" - command_template = "default_path" - class BalsamTask(Task): """Wraps a Balsam Task from the Balsam service @@ -218,8 +214,8 @@ def add_app(self, name, site, exepath, desc): libEBalsamApplication.sync() logger.debug("Added App {}".format(name)) - def register_app(self, full_path, site, app_name=None, calc_type=None, desc=None): - """Registers a user application to libEnsemble. + def register_app(self, balsam_app, app_name=None, calc_type=None, desc=None): + """Registers a Balsam application instance to libEnsemble. The ``full_path`` of the application must be supplied. 
Either ``app_name`` or ``calc_type`` can be used to identify the @@ -232,9 +228,6 @@ def register_app(self, full_path, site, app_name=None, calc_type=None, desc=None full_path: String The full path of the user application to be registered - site: String - The Balsam site name for where to launch the app - app_name: String, optional Name to identify this application. diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml new file mode 100644 index 000000000..fb77473f7 --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -0,0 +1,44 @@ +libE_specs: + save_every_k_gens: 1000 + profile: False + exit_criteria: + sim_max: 8 + +sim_specs: + function: libensemble.tests.scaling_tests.balsam_forces.forces_simf.run_forces_balsam + inputs: + - x + outputs: + energy: + type: float + user: + keys: + - seed + sim_app: /home/jnavarro/libensemble/libensemble/tests/scaling_tests/forces/forces.x + remote_ensemble_dir: /home/jnavarro/bebop_output/ensemble_ + cores: 1 + sim_particles: 1.e+3 + sim_timesteps: 5 + sim_kill_minutes: 10.0 + particle_variance: 0.2 + kill_rate: 0.5 + fail_on_sim: False + fail_on_submit: False + +gen_specs: + function: libensemble.gen_funcs.sampling.uniform_random_sample + outputs: + x: + type: float + size: 1 + user: + gen_batch_size: 1000 + +alloc_specs: + function: libensemble.alloc_funcs.give_sim_work_first.give_sim_work_first + outputs: + allocated: + type: bool + user: + batch_mode: True + num_active_gens: 1 diff --git a/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh b/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh new file mode 100644 index 000000000..20b106ba4 --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Building flat MPI + +# GCC +mpicc -O3 -o forces.x ../forces/forces.c -lm + +# Intel +# mpiicc -O3 -o forces.x forces.c + +# Cray 
+# cc -O3 -o forces.x forces.c + +# ---------------------------------------------- + +# Building with OpenMP for CPU + +# GCC +# mpicc -O3 -fopenmp -o forces.x forces.c -lm + +# Intel +# mpiicc -O3 -qopenmp -o forces.x forces.c + +# Cray / Intel (for CCE OpenMP is recognized by default) +# cc -O3 -qopenmp -o forces.x forces.c + +# xl +# xlc_r -O3 -qsmp=omp -o forces.x forces.c + +# ---------------------------------------------- + +# Building with OpenMP for target device (e.g. GPU) +# Need to toggle to OpenMP target directive in forces.c. + +# xl +# xlc_r -O3 -qsmp=omp -qoffload -o forces.x forces.c + +# IRIS node (Intel Gen9 GPU) +# env MPICH_CC=icx mpigcc -g -fiopenmp -fopenmp-targets=spir64 -o forces.x forces.c diff --git a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh new file mode 100644 index 000000000..54c41aa6e --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh @@ -0,0 +1 @@ +rm -r ensemble_* *.npy *.pickle ensemble.log lib*.txt diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py new file mode 100644 index 000000000..66783b3f4 --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -0,0 +1,114 @@ + +def run_forces_balsam(H, persis_info, sim_specs, libE_info): + + import os + import time + import secrets + import numpy as np + + from libensemble.executors.mpi_executor import MPIExecutor + from libensemble.message_numbers import WORKER_DONE, WORKER_KILL, TASK_FAILED + + class ForcesException(Exception): + """ Raised on some issue with Forces """ + + def perturb(particles, seed, max_fraction): + MAX_SEED = 32767 + """Modify particle count""" + seed_fraction = seed/MAX_SEED + max_delta = particles * max_fraction + delta = seed_fraction * max_delta + delta = delta - max_delta/2 # translate so -/+ + new_particles = particles + delta + return 
int(new_particles) + + def read_last_line(filepath): + """Read last line of statfile""" + try: + with open(filepath, 'rb') as fh: + line = fh.readlines()[-1].decode().rstrip() + except Exception: + line = "" # In case file is empty or not yet created + return line + + if sim_specs['user']['fail_on_sim']: + raise ForcesException(Exception) + + calc_status = 0 # Returns to worker + + x = H['x'] + sim_particles = sim_specs['user']['sim_particles'] + sim_timesteps = sim_specs['user']['sim_timesteps'] + time_limit = sim_specs['user']['sim_kill_minutes'] * 60.0 + sim_app = sim_specs['user']['sim_app'] + + exctr = MPIExecutor() + exctr.register_app(full_path=sim_app, app_name='forces') + + calc_dir = os.path.join(sim_specs['user']['remote_ensemble_dir'], secrets.token_hex(nbytes=4)) + os.makedirs(calc_dir, exist_ok=True) + os.chdir(calc_dir) + + # Get from dictionary if key exists, else return default (e.g. 0) + cores = sim_specs['user'].get('cores', None) + kill_rate = sim_specs['user'].get('kill_rate', 0) + particle_variance = sim_specs['user'].get('particle_variance', 0) + + # Composing variable names and x values to set up simulation + seed = int(np.rint(x[0][0])) + + # This is to give a random variance of work-load + sim_particles = perturb(sim_particles, seed, particle_variance) + print('seed: {} particles: {}'.format(seed, sim_particles)) + + args = str(int(sim_particles)) + ' ' + str(sim_timesteps) + ' ' + str(seed) + ' ' + str(kill_rate) + + task = exctr.submit(app_name='forces') # Auto-partition + + # Stat file to check for bad runs + statfile = 'forces.stat' + filepath = os.path.join(task.workdir, statfile) + line = None + + poll_interval = 1 # secs + while(not task.finished): + # Read last line of statfile + line = read_last_line(filepath) + if line == "kill": + task.kill() # Bad run + elif task.runtime > time_limit: + task.kill() # Timeout + else: + time.sleep(poll_interval) + task.poll() + + if task.finished: + if task.state == 'FINISHED': + print("Task {} 
completed".format(task.name)) + calc_status = WORKER_DONE + if read_last_line(filepath) == "kill": + # Generally mark as complete if want results (completed after poll - before readline) + print("Warning: Task completed although marked as a bad run (kill flag set in forces.stat)") + elif task.state == 'FAILED': + print("Warning: Task {} failed: Error code {}".format(task.name, task.errcode)) + calc_status = TASK_FAILED + elif task.state == 'USER_KILLED': + print("Warning: Task {} has been killed".format(task.name)) + calc_status = WORKER_KILL + else: + print("Warning: Task {} in unknown state {}. Error code {}".format(task.name, task.state, task.errcode)) + + time.sleep(0.2) + try: + data = np.loadtxt(filepath) + # task.read_file_in_workdir(statfile) + final_energy = data[-1] + except Exception: + final_energy = np.nan + # print('Warning - Energy Nan') + + outspecs = sim_specs['out'] + output = np.zeros(1, dtype=outspecs) + output['energy'][0] = final_energy + + return output, persis_info, calc_status diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md new file mode 100644 index 000000000..6f3f804fe --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -0,0 +1,55 @@ +## Running test run_libe_forces_funcx.py + +Naive Electostatics Code Test + +This is designed only as an artificial, highly configurable test +code for a libEnsemble sim func. This variant is primarily to test libEnsemble's +capability to submit simulation functions to a separate machine from where libEnsemble's +manager and workers are running. + +### Forces Mini-App + +A system of charged particles is set up and simulated over a number of time-steps. + +Particles position and charge are initiated by a random stream. +Particles are replicated on all ranks. +**Each rank** computes forces for a subset of particles (O(N^2)) +Particle force arrays are allreduced across ranks. 
+Particles are moved (replicated on each rank) +Total energy is appended to file forces.stat + +To run forces as a standalone executable on N procs: + + mpirun -np N ./forces.x + +This application will need to be built on the remote machine where the sims will run. +See below. + +### Running with libEnsemble. + +On the remote machine: + + pip install funcx-endpoint + funcx-endpoint configure forces + +Configure the endpoint's `config.py` to include your project information and +match the machine's specifications. +See [here](https://funcx.readthedocs.io/en/latest/endpoints.html#theta-alcf) for +an example ALCF Theta configuration. + +Then to run with local comms (multiprocessing) with one manager and `N` workers: + + python run_libe_forces_funcx.py --comms local --nworkers N + +To run with MPI comms using one manager and `N-1` workers: + + mpirun -np N python run_libe_forces.py + +Application parameters can be adjusted in `funcx_forces.yaml`. + +Note that each function and path must be accessible and/or importable on the +remote machine. Absolute paths are recommended. 
+ +To remove output before the next run: + + ./cleanup.sh diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py new file mode 100644 index 000000000..ebc306dd0 --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +import secrets +import numpy as np + +from libensemble import Ensemble +from libensemble.executors import NewBalsamMPIExecutor + +from balsam.api import ApplicationDefinition + +forces = Ensemble() +forces.from_yaml('balsam_forces.yaml') + +class RemoteForces(ApplicationDefinition): + site = 'libe-bebop' + command_template = './forces.x {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + +import ipdb; ipdb.set_trace() + +exctr = NewBalsamMPIExecutor() +exctr.register_app(RemoteForces, app_name='forces') + +forces.sim_specs['user']['remote_ensemble_dir'] += secrets.token_hex(nbytes=3) + +forces.gen_specs['user'].update({ + 'lb': np.array([0]), + 'ub': np.array([32767]) +}) + +forces.persis_info.add_random_streams() + +forces.run() From bb9c88f63699703fde3a5e2f23c68deb97aa263f Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 31 Jan 2022 17:27:13 -0600 Subject: [PATCH 05/93] update balsam simf, set Application to accept optional PyObj (balsam app def) --- libensemble/executors/executor.py | 4 +- libensemble/executors/new_balsam_executor.py | 59 +++++++++---------- .../scaling_tests/balsam_forces/cleanup.sh | 0 .../balsam_forces/forces_simf.py | 19 +++--- .../balsam_forces/run_libe_forces_balsam.py | 7 +-- 5 files changed, 43 insertions(+), 46 deletions(-) mode change 100644 => 100755 libensemble/tests/scaling_tests/balsam_forces/cleanup.sh diff --git a/libensemble/executors/executor.py b/libensemble/executors/executor.py index 071e90dca..500a17445 100644 --- a/libensemble/executors/executor.py +++ b/libensemble/executors/executor.py @@ -77,7 +77,7 @@ class Application: 
prefix = 'libe_app' - def __init__(self, full_path, name=None, calc_type='sim', desc=None, site=None): + def __init__(self, full_path, name=None, calc_type='sim', desc=None, pyobj=None): """Instantiates a new Application instance.""" self.full_path = full_path self.calc_type = calc_type @@ -86,8 +86,8 @@ def __init__(self, full_path, name=None, calc_type='sim', desc=None, site=None): if self.exe.endswith('.py'): self.full_path = ' '.join((sys.executable, full_path)) self.name = name or self.exe + self.pyobj = pyobj self.desc = desc or (self.exe + ' app') - self.site = site or None self.gname = '_'.join([Application.prefix, self.name]) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index d9a81a079..78142a723 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -23,7 +23,7 @@ Application, Task, ExecutorException, TimeoutExpired, jassert, STATES from libensemble.executors.mpi_executor import MPIExecutor -from balsam.api import ApplicationDefinition, BatchJob, EventLog +from balsam.api import ApplicationDefinition, BatchJob, EventLog, Job logger = logging.getLogger(__name__) # To change logging level for just this module @@ -111,7 +111,7 @@ def poll(self): return # Get current state of tasks from Balsam database - self.process.refresh_from_db() + # self.process.refresh_from_db() balsam_state = self.process.state self.runtime = self._get_time_since_balsam_submit() @@ -199,12 +199,12 @@ def __init__(self, custom_info={}): def serial_setup(self): """Balsam serial setup includes empyting database and adding applications""" - for app in self.apps.values(): - calc_name = app.gname - desc = app.desc - full_path = app.full_path - site = app.site - self.add_app(calc_name, site, full_path, desc) + # for app in self.apps.values(): + # calc_name = app.gname + # desc = app.desc + # full_path = app.full_path + # site = app.site + # self.add_app(calc_name, site, 
full_path, desc) def add_app(self, name, site, exepath, desc): """ Sync application with balsam service """ @@ -214,7 +214,7 @@ def add_app(self, name, site, exepath, desc): libEBalsamApplication.sync() logger.debug("Added App {}".format(name)) - def register_app(self, balsam_app, app_name=None, calc_type=None, desc=None): + def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): """Registers a Balsam application instance to libEnsemble. The ``full_path`` of the application must be supplied. Either @@ -240,8 +240,8 @@ def register_app(self, balsam_app, app_name=None, calc_type=None, desc=None): """ if not app_name: - app_name = os.path.split(full_path)[1] - self.apps[app_name] = Application(full_path, app_name, calc_type, desc, site) + app_name = BalsamApp.command_template.split(" ")[0] + self.apps[app_name] = Application(" ", app_name, calc_type, desc, BalsamApp) # Default sim/gen apps will be deprecated. Just use names. if calc_type is not None: @@ -252,11 +252,11 @@ def register_app(self, balsam_app, app_name=None, calc_type=None, desc=None): def set_resources(self, resources): self.resources = resources - def submit(self, calc_type=None, app_name=None, num_procs=None, - num_nodes=None, procs_per_node=None, machinefile=None, - app_args=None, stdout=None, stderr=None, stage_inout=None, - hyperthreads=False, gpus_per_rank=0, dry_run=False, wait_on_start=False, - queue=None, project=None, wall_time_min=None, extra_args=''): + def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, + num_nodes=None, procs_per_node=None, tasks_per_node=None, + machinefile=None, stdout=None, stderr=None, + stage_inout=None, gpus_per_rank=0, dry_run=False, wait_on_start=False, + queue=None, project=None, wall_time_min=None, extra_args={}): """Creates a new task, and either executes or schedules to execute in the executor @@ -291,26 +291,23 @@ def submit(self, calc_type=None, app_name=None, num_procs=None, task = BalsamTask(app, app_args, 
default_workdir, stdout, stderr, self.workerID) - add_task_args = {'name': task.name, - 'workflow': self.workflow_name, - 'user_workdir': default_workdir, - 'application': app.gname, - 'args': task.app_args, - 'num_nodes': num_nodes, - 'procs_per_node': procs_per_node, - 'mpi_flags': extra_args} - if dry_run: task.dry_run = True logger.info('Test (No submit) to Balsam: {}'.format(' '.join(add_task_args))) task._set_complete(dry_run=True) else: - - task.process = libEBalsamApplication.submit(workdir=self.workflow_name, - num_nodes=num_nodes, - ranks_per_node=procs_per_node, - wall_time_min=wall_time_min, - gpus_per_rank=gpus_per_rank) + App = app.pyobj + App.sync() + task.process = Job(app_id=App, workdir=self.workflow_name, + parameters=app_args, + num_nodes=num_nodes, + ranks_per_node=procs_per_node, + launch_params=extra_args, + gpus_per_rank=gpus_per_rank, + node_packing_count=tasks_per_node, + wall_time_min=wall_time_min) + + task.process.save() task.batchjob = BatchJob.objects.create( site_id=task.process.site_id, diff --git a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh old mode 100644 new mode 100755 diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 66783b3f4..ccca4cc14 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -6,7 +6,7 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): import secrets import numpy as np - from libensemble.executors.mpi_executor import MPIExecutor + from libensemble.executors.executor import Executor from libensemble.message_numbers import WORKER_DONE, WORKER_KILL, TASK_FAILED class ForcesException(Exception): @@ -36,18 +36,17 @@ def read_last_line(filepath): calc_status = 0 # Returns to worker + exctr = Executor.executor + x = H['x'] sim_particles = 
sim_specs['user']['sim_particles'] sim_timesteps = sim_specs['user']['sim_timesteps'] time_limit = sim_specs['user']['sim_kill_minutes'] * 60.0 sim_app = sim_specs['user']['sim_app'] - exctr = MPIExecutor() - exctr.register_app(full_path=sim_app, app_name='forces') - - calc_dir = os.path.join(sim_specs['user']['remote_ensemble_dir'], secrets.token_hex(nbytes=4)) - os.makedirs(calc_dir, exist_ok=True) - os.chdir(calc_dir) + # calc_dir = os.path.join(sim_specs['user']['remote_ensemble_dir'], secrets.token_hex(nbytes=4)) + # os.makedirs(calc_dir, exist_ok=True) + # os.chdir(calc_dir) # Get from dictionary if key exists, else return default (e.g. 0) cores = sim_specs['user'].get('cores', None) @@ -61,9 +60,11 @@ def read_last_line(filepath): sim_particles = perturb(sim_particles, seed, particle_variance) print('seed: {} particles: {}'.format(seed, sim_particles)) - args = str(int(sim_particles)) + ' ' + str(sim_timesteps) + ' ' + str(seed) + ' ' + str(kill_rate) + args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} - task = exctr.submit(app_name='forces') # Auto-partition + task = exctr.submit(app_name='forces', app_args=args, + num_procs=64, num_nodes=1, procs_per_node=64, tasks_per_node=1, + queue="debug-cache-quad", project="CSC250STMS07", wall_time_min=15) # Auto-partition # Stat file to check for bad runs statfile = 'forces.stat' diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index ebc306dd0..7144ee910 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -11,10 +11,9 @@ forces.from_yaml('balsam_forces.yaml') class RemoteForces(ApplicationDefinition): - site = 'libe-bebop' - command_template = './forces.x {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' - -import ipdb; 
ipdb.set_trace() + site = 'jln_theta' + command_template = '/home/jnavarro/libensemble/libensemble/tests/scaling_tests/forces/forces.x' + \ + '{{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' exctr = NewBalsamMPIExecutor() exctr.register_app(RemoteForces, app_name='forces') From 47839febe20ef5834114c81d96147b69712abec3 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Feb 2022 17:04:01 -0600 Subject: [PATCH 06/93] rename parameter, fix spacing in command template --- libensemble/executors/new_balsam_executor.py | 4 ++-- libensemble/tests/scaling_tests/balsam_forces/forces_simf.py | 4 ++-- .../scaling_tests/balsam_forces/run_libe_forces_balsam.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 78142a723..f1bfc3783 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -253,7 +253,7 @@ def set_resources(self, resources): self.resources = resources def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, - num_nodes=None, procs_per_node=None, tasks_per_node=None, + num_nodes=None, procs_per_node=None, max_tasks_per_node=None, machinefile=None, stdout=None, stderr=None, stage_inout=None, gpus_per_rank=0, dry_run=False, wait_on_start=False, queue=None, project=None, wall_time_min=None, extra_args={}): @@ -304,7 +304,7 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, ranks_per_node=procs_per_node, launch_params=extra_args, gpus_per_rank=gpus_per_rank, - node_packing_count=tasks_per_node, + node_packing_count=max_tasks_per_node, wall_time_min=wall_time_min) task.process.save() diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index ccca4cc14..e69b595f2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ 
b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -63,8 +63,8 @@ def read_last_line(filepath): args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} task = exctr.submit(app_name='forces', app_args=args, - num_procs=64, num_nodes=1, procs_per_node=64, tasks_per_node=1, - queue="debug-cache-quad", project="CSC250STMS07", wall_time_min=15) # Auto-partition + num_procs=64, num_nodes=1, procs_per_node=64, max_tasks_per_node=1, + queue="debug-cache-quad", project="CSC250STMS07", wall_time_min=30) # Auto-partition # Stat file to check for bad runs statfile = 'forces.stat' diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 7144ee910..3641d10ee 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -13,7 +13,8 @@ class RemoteForces(ApplicationDefinition): site = 'jln_theta' command_template = '/home/jnavarro/libensemble/libensemble/tests/scaling_tests/forces/forces.x' + \ - '{{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + \ + ' > out.txt 2>&1' exctr = NewBalsamMPIExecutor() exctr.register_app(RemoteForces, app_name='forces') From 4f40ae8b8a748662e030c4ce67d2f938a3acc0e5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Feb 2022 13:06:46 -0600 Subject: [PATCH 07/93] refactor, add BalsamExecutor.submit_allocation() to reserve resources from a Balsam site --- libensemble/executors/new_balsam_executor.py | 50 +++++++++++-------- .../balsam_forces/forces_simf.py | 9 +--- .../balsam_forces/run_libe_forces_balsam.py | 5 +- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index f1bfc3783..bf8dc31c1 100644 --- 
a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -194,10 +194,11 @@ def __init__(self, custom_info={}): super().__init__(custom_info) self.workflow_name = "libe_workflow" - self.application_objs = {} + self.allocations = [] def serial_setup(self): """Balsam serial setup includes empyting database and adding applications""" + pass # for app in self.apps.values(): # calc_name = app.gname @@ -208,10 +209,7 @@ def serial_setup(self): def add_app(self, name, site, exepath, desc): """ Sync application with balsam service """ - - libEBalsamApplication.site = site - libEBalsamApplication.command_template = exepath - libEBalsamApplication.sync() + pass logger.debug("Added App {}".format(name)) def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): @@ -249,14 +247,35 @@ def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): "Unrecognized calculation type", calc_type) self.default_apps[calc_type] = self.apps[app_name] + def submit_allocation(site_id, num_nodes, wall_time_min, job_mode="mpi", + queue="local", project="local"): + """ + Submits a Balsam BatchJob machine allocation request to Balsam. + Corresponding Balsam applications with a matching site can be submitted to this allocation. + """ + + self.allocations.append( + BatchJob.objects.create( + site_id=site_id, + num_nodes=num_nodes, + wall_time_min=wall_time_min, + job_mode=job_mode, + queue=queue, + project=project + ) + ) + + logger.info("Submitted Batch allocation to endpoint {}: " + "nodes {} queue {} project {}". 
+ format(site_id, num_nodes, queue, project)) + def set_resources(self, resources): self.resources = resources def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, num_nodes=None, procs_per_node=None, max_tasks_per_node=None, - machinefile=None, stdout=None, stderr=None, - stage_inout=None, gpus_per_rank=0, dry_run=False, wait_on_start=False, - queue=None, project=None, wall_time_min=None, extra_args={}): + machinefile=None, stdout=None, stderr=None, gpus_per_rank=0, transfers={}, + dry_run=False, wait_on_start=False, extra_args={}): """Creates a new task, and either executes or schedules to execute in the executor @@ -305,19 +324,10 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, launch_params=extra_args, gpus_per_rank=gpus_per_rank, node_packing_count=max_tasks_per_node, - wall_time_min=wall_time_min) + transfers=transfers) task.process.save() - task.batchjob = BatchJob.objects.create( - site_id=task.process.site_id, - num_nodes=num_nodes, - wall_time_min=wall_time_min, - job_mode="mpi", - queue=queue, - project=project - ) - if (wait_on_start): self._wait_on_start(task) @@ -325,9 +335,9 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, task.timer.start() task.submit_time = task.timer.tstart # Time not date - may not need if using timer. - logger.info("Added task to Balsam database {}: " + logger.info("Submitted Balsam App to endpoint {}: " "nodes {} ppn {}". - format(task.name, num_nodes, procs_per_node)) + format(App.site, num_nodes, procs_per_node)) # task.workdir = task.process.working_directory # Might not be set yet! 
self.list_of_tasks.append(task) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index e69b595f2..76867f071 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -44,10 +44,6 @@ def read_last_line(filepath): time_limit = sim_specs['user']['sim_kill_minutes'] * 60.0 sim_app = sim_specs['user']['sim_app'] - # calc_dir = os.path.join(sim_specs['user']['remote_ensemble_dir'], secrets.token_hex(nbytes=4)) - # os.makedirs(calc_dir, exist_ok=True) - # os.chdir(calc_dir) - # Get from dictionary if key exists, else return default (e.g. 0) cores = sim_specs['user'].get('cores', None) kill_rate = sim_specs['user'].get('kill_rate', 0) @@ -62,9 +58,8 @@ def read_last_line(filepath): args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} - task = exctr.submit(app_name='forces', app_args=args, - num_procs=64, num_nodes=1, procs_per_node=64, max_tasks_per_node=1, - queue="debug-cache-quad", project="CSC250STMS07", wall_time_min=30) # Auto-partition + task = exctr.submit(app_name='forces', app_args=args, num_procs=64, num_nodes=1, + procs_per_node=64, max_tasks_per_node=1) # Stat file to check for bad runs statfile = 'forces.stat' diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 3641d10ee..3aaef15bb 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -import secrets import numpy as np from libensemble import Ensemble @@ -17,10 +16,10 @@ class RemoteForces(ApplicationDefinition): ' > out.txt 2>&1' exctr = NewBalsamMPIExecutor() +exctr.submit_allocation(site_id='jln_theta', num_nodes=8, 
wall_time_min=30, + queue='debug-cache-quad', project='CSC250STMS07') exctr.register_app(RemoteForces, app_name='forces') -forces.sim_specs['user']['remote_ensemble_dir'] += secrets.token_hex(nbytes=3) - forces.gen_specs['user'].update({ 'lb': np.array([0]), 'ub': np.array([32767]) From bd1d3fd056f2e959b9fa6e20f1fc3650de9d13ee Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Feb 2022 13:36:51 -0600 Subject: [PATCH 08/93] flake8 --- libensemble/executors/__init__.py | 2 -- libensemble/executors/new_balsam_executor.py | 15 ++++++++----- .../balsam_forces/balsam_forces.yaml | 3 --- .../balsam_forces/build_forces.sh | 0 .../balsam_forces/forces_simf.py | 4 +--- .../balsam_forces/run_libe_forces_balsam.py | 16 ++++++++------ .../tests/scaling_tests/forces/forces_simf.py | 21 ++++++++----------- 7 files changed, 30 insertions(+), 31 deletions(-) mode change 100644 => 100755 libensemble/tests/scaling_tests/balsam_forces/build_forces.sh diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 850484c36..98b9175cd 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -7,6 +7,4 @@ if 'BALSAM_DB_PATH' in os.environ and int(sys.version[2]) >= 6: from libensemble.executors.balsam_executor import BalsamMPIExecutor - - __all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamMPIExecutor'] diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index bf8dc31c1..5e31cb446 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -23,7 +23,7 @@ Application, Task, ExecutorException, TimeoutExpired, jassert, STATES from libensemble.executors.mpi_executor import MPIExecutor -from balsam.api import ApplicationDefinition, BatchJob, EventLog, Job +from balsam.api import BatchJob, EventLog, Job logger = logging.getLogger(__name__) # To change logging level for just this module @@ -247,7 +247,7 @@ def 
register_app(self, BalsamApp, app_name, calc_type=None, desc=None): "Unrecognized calculation type", calc_type) self.default_apps[calc_type] = self.apps[app_name] - def submit_allocation(site_id, num_nodes, wall_time_min, job_mode="mpi", + def submit_allocation(self, site_id, num_nodes, wall_time_min, job_mode="mpi", queue="local", project="local"): """ Submits a Balsam BatchJob machine allocation request to Balsam. @@ -275,7 +275,7 @@ def set_resources(self, resources): def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, num_nodes=None, procs_per_node=None, max_tasks_per_node=None, machinefile=None, stdout=None, stderr=None, gpus_per_rank=0, transfers={}, - dry_run=False, wait_on_start=False, extra_args={}): + workdir='', dry_run=False, wait_on_start=False, extra_args={}): """Creates a new task, and either executes or schedules to execute in the executor @@ -289,6 +289,11 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, else: raise ExecutorException("Either app_name or calc_type must be set") + if len(workdir): + workdir = os.path.join(self.workflow_name, workdir) + else: + workdir = self.workflow_name + # Specific to this class if machinefile is not None: logger.warning("machinefile arg ignored - not supported in Balsam") @@ -312,12 +317,12 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, if dry_run: task.dry_run = True - logger.info('Test (No submit) to Balsam: {}'.format(' '.join(add_task_args))) + logger.info('Test (No submit) Balsam app {}'.format(app_name)) task._set_complete(dry_run=True) else: App = app.pyobj App.sync() - task.process = Job(app_id=App, workdir=self.workflow_name, + task.process = Job(app_id=App, workdir=workdir, parameters=app_args, num_nodes=num_nodes, ranks_per_node=procs_per_node, diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index 
fb77473f7..5660b6d68 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -14,9 +14,6 @@ sim_specs: user: keys: - seed - sim_app: /home/jnavarro/libensemble/libensemble/tests/scaling_tests/forces/forces.x - remote_ensemble_dir: /home/jnavarro/bebop_output/ensemble_ - cores: 1 sim_particles: 1.e+3 sim_timesteps: 5 sim_kill_minutes: 10.0 diff --git a/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh b/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh old mode 100644 new mode 100755 diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 76867f071..ee2e17808 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -42,10 +42,8 @@ def read_last_line(filepath): sim_particles = sim_specs['user']['sim_particles'] sim_timesteps = sim_specs['user']['sim_timesteps'] time_limit = sim_specs['user']['sim_kill_minutes'] * 60.0 - sim_app = sim_specs['user']['sim_app'] # Get from dictionary if key exists, else return default (e.g. 
0) - cores = sim_specs['user'].get('cores', None) kill_rate = sim_specs['user'].get('kill_rate', 0) particle_variance = sim_specs['user'].get('particle_variance', 0) @@ -59,7 +57,7 @@ def read_last_line(filepath): args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} task = exctr.submit(app_name='forces', app_args=args, num_procs=64, num_nodes=1, - procs_per_node=64, max_tasks_per_node=1) + procs_per_node=64, max_tasks_per_node=1, workdir=secrets.token_hex(nbytes=3)) # Stat file to check for bad runs statfile = 'forces.stat' diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 3aaef15bb..1d0a66df9 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -9,15 +9,19 @@ forces = Ensemble() forces.from_yaml('balsam_forces.yaml') + class RemoteForces(ApplicationDefinition): - site = 'jln_theta' - command_template = '/home/jnavarro/libensemble/libensemble/tests/scaling_tests/forces/forces.x' + \ - ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + \ - ' > out.txt 2>&1' + site = 'three' + command_template = \ + '/Users/jnavarro/Desktop/libensemble/' + \ + 'libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x' + \ + ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + \ + ' > out.txt 2>&1' + exctr = NewBalsamMPIExecutor() -exctr.submit_allocation(site_id='jln_theta', num_nodes=8, wall_time_min=30, - queue='debug-cache-quad', project='CSC250STMS07') +exctr.submit_allocation(site_id='three', num_nodes=1, wall_time_min=30, + queue='local', project='local') exctr.register_app(RemoteForces, app_name='forces') forces.gen_specs['user'].update({ diff --git a/libensemble/tests/scaling_tests/forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/forces_simf.py 
index 2a89523d2..3dbb350bb 100644 --- a/libensemble/tests/scaling_tests/forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/forces_simf.py @@ -78,18 +78,15 @@ def run_forces(H, persis_info, sim_specs, libE_info): if sim_specs['user']['fail_on_submit']: machinefile = 'fail' - # # Machinefile only used here for exception testing - # if cores: - # task = exctr.submit(app_name='forces', num_procs=cores, app_args=args, - # stdout='out.txt', stderr='err.txt', wait_on_start=True, - # machinefile=machinefile) - # else: - # task = exctr.submit(app_name='forces', app_args=args, stdout='out.txt', - # stderr='err.txt', wait_on_start=True, hyperthreads=True, - # machinefile=machinefile) # Auto-partition - - task = exctr.submit(app_name='forces', num_procs=cores, app_args=args, wait_on_start=True, - queue='local', project='local', wall_time_min=10) # Auto-partition + # Machinefile only used here for exception testing + if cores: + task = exctr.submit(app_name='forces', num_procs=cores, app_args=args, + stdout='out.txt', stderr='err.txt', wait_on_start=True, + machinefile=machinefile) + else: + task = exctr.submit(app_name='forces', app_args=args, stdout='out.txt', + stderr='err.txt', wait_on_start=True, hyperthreads=True, + machinefile=machinefile) # Auto-partition # Stat file to check for bad runs statfile = 'forces.stat' From 38cfdac515e4ec4ef0dfe9067ddad07554ad8c6a Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Feb 2022 17:11:21 -0600 Subject: [PATCH 09/93] refactoring, use local site, return 0 runtime if EventLog query is empty --- libensemble/executors/new_balsam_executor.py | 25 ++++++------------- .../balsam_forces/balsam_forces.yaml | 1 + .../balsam_forces/forces_simf.py | 7 +++--- .../balsam_forces/run_libe_forces_balsam.py | 4 +-- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 5e31cb446..c8785c1cf 100644 --- 
a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -23,7 +23,7 @@ Application, Task, ExecutorException, TimeoutExpired, jassert, STATES from libensemble.executors.mpi_executor import MPIExecutor -from balsam.api import BatchJob, EventLog, Job +from balsam.api import Job, BatchJob, EventLog logger = logging.getLogger(__name__) # To change logging level for just this module @@ -56,8 +56,11 @@ def _get_time_since_balsam_submit(self): # self.runtime = self.process.runtime_seconds # Only reports at end of run currently # balsam_launch_datetime = self.process.get_state_times().get('RUNNING', None) - balsam_launch_datetime = EventLog.objects.filter( + event_query = EventLog.objects.filter( job_id=self.process.id, to_state="RUNNING") + if not len(event_query): + return 0 + balsam_launch_datetime = event_query[0].timestamp current_datetime = datetime.datetime.now() if balsam_launch_datetime: return (current_datetime - balsam_launch_datetime).total_seconds() @@ -274,7 +277,7 @@ def set_resources(self, resources): def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, num_nodes=None, procs_per_node=None, max_tasks_per_node=None, - machinefile=None, stdout=None, stderr=None, gpus_per_rank=0, transfers={}, + machinefile=None, gpus_per_rank=0, transfers={}, workdir='', dry_run=False, wait_on_start=False, extra_args={}): """Creates a new task, and either executes or schedules to execute in the executor @@ -300,20 +303,8 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, jassert(num_procs or num_nodes or procs_per_node, "No procs/nodes provided - aborting") - num_procs, num_nodes, procs_per_node = \ - mpi_resources.task_partition(num_procs, num_nodes, procs_per_node) - - if stdout is not None or stderr is not None: - logger.warning("Balsam does not currently accept a stdout " - "or stderr name - ignoring") - stdout = None - stderr = None - - # Will be possible to override with 
arg when implemented - # (or can have option to let Balsam assign) - default_workdir = os.getcwd() - task = BalsamTask(app, app_args, default_workdir, - stdout, stderr, self.workerID) + task = BalsamTask(app, app_args, workdir, + None, None, self.workerID) if dry_run: task.dry_run = True diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index 5660b6d68..0e62f9af5 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -14,6 +14,7 @@ sim_specs: user: keys: - seed + balsam_data_dir: /Users/jnavarro/Desktop/libensemble/newbalsam/sites/three/data/ sim_particles: 1.e+3 sim_timesteps: 5 sim_kill_minutes: 10.0 diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index ee2e17808..237f560e2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -55,13 +55,14 @@ def read_last_line(filepath): print('seed: {} particles: {}'.format(seed, sim_particles)) args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} + workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_urlsafe(nbytes=3) - task = exctr.submit(app_name='forces', app_args=args, num_procs=64, num_nodes=1, - procs_per_node=64, max_tasks_per_node=1, workdir=secrets.token_hex(nbytes=3)) + task = exctr.submit(app_name='forces', app_args=args, num_procs=8, num_nodes=1, + procs_per_node=8, max_tasks_per_node=4, workdir=workdir) # Stat file to check for bad runs statfile = 'forces.stat' - filepath = os.path.join(task.workdir, statfile) + filepath = sim_specs['user']['balsam_data_dir'] + os.path.join(task.workdir, statfile) line = None poll_interval = 1 # secs diff --git 
a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 1d0a66df9..e16100527 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -20,9 +20,9 @@ class RemoteForces(ApplicationDefinition): exctr = NewBalsamMPIExecutor() -exctr.submit_allocation(site_id='three', num_nodes=1, wall_time_min=30, - queue='local', project='local') exctr.register_app(RemoteForces, app_name='forces') +exctr.submit_allocation(site_id=239, num_nodes=1, wall_time_min=30, + queue='local', project='local') forces.gen_specs['user'].update({ 'lb': np.array([0]), From 604106eaf0c8397d82329b0427a85f62ea78ec1b Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Feb 2022 17:23:50 -0600 Subject: [PATCH 10/93] sadly, mpiresources unused in new executor? --- libensemble/executors/new_balsam_executor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index c8785c1cf..62608ca7d 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -18,7 +18,6 @@ import time import datetime -from libensemble.resources import mpi_resources from libensemble.executors.executor import \ Application, Task, ExecutorException, TimeoutExpired, jassert, STATES from libensemble.executors.mpi_executor import MPIExecutor From b25c33842e16f9566320b6968de8cb64c4152098 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Feb 2022 08:06:13 -0600 Subject: [PATCH 11/93] Whitespace --- .github/workflows/ci.yml | 1 - docs/examples/sim_funcs.rst | 1 - .../balsam_forces/forces_simf.py | 1 - .../scaling_tests/funcx_forces/forces_simf.py | 1 - tex/images/diagram.tex | 20 ++--- tex/planning_doc/planning_doc.tex | 90 +++++++++---------- 6 files changed, 52 insertions(+), 62 deletions(-) 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3b13d58be..a43edf48e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -202,7 +202,6 @@ jobs: mv libensemble/tests/.cov* . coveralls --service=github - coveralls: name: Notify coveralls of all jobs completing needs: [test-libE] diff --git a/docs/examples/sim_funcs.rst b/docs/examples/sim_funcs.rst index 01d613295..462f5e1c3 100644 --- a/docs/examples/sim_funcs.rst +++ b/docs/examples/sim_funcs.rst @@ -22,7 +22,6 @@ chwirut :members: :undoc-members: - noisy_vector_mapping -------------------- .. automodule:: noisy_vector_mapping diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 237f560e2..b0420f2ce 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -1,4 +1,3 @@ - def run_forces_balsam(H, persis_info, sim_specs, libE_info): import os diff --git a/libensemble/tests/scaling_tests/funcx_forces/forces_simf.py b/libensemble/tests/scaling_tests/funcx_forces/forces_simf.py index 108ce0cac..783c044ba 100644 --- a/libensemble/tests/scaling_tests/funcx_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/funcx_forces/forces_simf.py @@ -1,4 +1,3 @@ - def run_forces_funcx(H, persis_info, sim_specs, libE_info): import os diff --git a/tex/images/diagram.tex b/tex/images/diagram.tex index 1ee744b0c..eeeb3c11a 100644 --- a/tex/images/diagram.tex +++ b/tex/images/diagram.tex @@ -1,4 +1,4 @@ -\documentclass[tikz,border=1pt,convert={density=300,size=1080x800,outext=.png}]{standalone} +\documentclass[tikz,border=1pt,convert={density=300,size=1080x800,outext=.png}]{standalone} \usepackage{pgfplots,tikz} \usetikzlibrary{shapes,arrows,decorations.markings,shadows,positioning} \begin{document} @@ -27,7 +27,6 @@ \tikzstyle{completedgenbox} = [box, very thick, fill=ArgonneLogoBlue!50] \tikzstyle{completedsimbox} 
= [box, very thick, fill=ArgonneLogoGreen!50] - \tikzstyle{linea} = [draw, thick, shorten >=1pt, shorten <=1pt, >=latex] % Define distances @@ -41,18 +40,17 @@ \node (mantitle) [textbox] {libEnsemble Manager}; \path (mantitle.east)+(4.0,-0.2) node (workertitle) [textbox] {Workers}; - % Make the manager contents + % Make the manager contents \node [redshape,below=0.2 of mantitle] (recv) {Receive}; \node [draw, below=\spaceinmanager of recv, redshape] (update) {Update active and queue}; \node [draw, below=\spaceinmanager of update, redshape] (decide) {Decide work\\and resources}; \node (give) [redshape, below=\spaceinmanager of decide] {Give work}; - % Make worker boxes \path (workertitle.south)+(0,-\spaceinworkers) node (worker1) [activesimbox] {active simulation}; - \path (worker1.south)+(0,-\spaceinworkers) node (worker2) [completedgenbox] {completed generation}; + \path (worker1.south)+(0,-\spaceinworkers) node (worker2) [completedgenbox] {completed generation}; \path (worker2.south)+(0,-\spaceinworkers) node (worker3) [activesimbox] {active simulation}; - \path (worker3.south)+(0,-\spaceinworkers) node (dots2) [textbox] {$\vdots$}; + \path (worker3.south)+(0,-\spaceinworkers) node (dots2) [textbox] {$\vdots$}; \path (dots2.south)+(0,-\spaceinworkers) node (worker4) [completedsimbox] {completed simulation}; % Make lines in manager @@ -68,22 +66,22 @@ % Bottom title % \path (give.south) + (0,-1) node (bottomlabel) {(A)POSMM Diagram}; - % Manager background + % Manager background \begin{pgfonlayer}{background} \path (recv.west |- recv.north)+(-0.2,0.2) node (a) {}; \path (give.east |- give.south)+(0.2,-0.2) node (c) {}; - + \path[fill=ArgonneLogoRed!20, rounded corners, draw=black, very thick] - (a) rectangle (c); + (a) rectangle (c); \end{pgfonlayer} % Worker's background \begin{pgfonlayer}{background} \path (worker1.west |- mantitle.north)+(-0.2,0.0) node (a2) {}; \path (worker4.east |- c)+(0.2,0.0) node (c2) {}; - + \path[fill=black!10] - (a2) rectangle (c2); + 
(a2) rectangle (c2); \end{pgfonlayer} \end{tikzpicture} \end{document} diff --git a/tex/planning_doc/planning_doc.tex b/tex/planning_doc/planning_doc.tex index edd35c318..e04f773d5 100644 --- a/tex/planning_doc/planning_doc.tex +++ b/tex/planning_doc/planning_doc.tex @@ -24,10 +24,10 @@ \libE. Proper planning should help ensure that the code will be flexible and easy to adjust in the future. - Current development considers a manager and worker framework. + Current development considers a manager and worker framework. \end{abstract} -In order to ensure applicability to a variety of use cases, +In order to ensure applicability to a variety of use cases, \libE will coordinate many different types of calculations. Examples of such calculations include: \begin{itemize} @@ -38,11 +38,11 @@ \item Monitoring intermediate output from simulations \end{itemize} Users will only need to provide scripts for simulation evaluation and point -generation. We will provide default scripts for the other types of calculations, +generation. We will provide default scripts for the other types of calculations, which can be templates for the interested user. As a default the Manager will perform many calculations itself (essentially blocking the manager), but we can monitor this in the future and adjust as needed if manager-calculations are -expensive. +expensive. We outline the behavior of the manager and worker within \libE: @@ -53,7 +53,7 @@ \item If the calculation is a simulation, determine parameters to be evaluated. \item If the calculation is a local optimization run, give state - information for determining the next point in a run. + information for determining the next point in a run. \item If the calculation is deciding where to start a run, then give all evaluated (and about-to-be-evaluated) points. 
\end{itemize} @@ -65,13 +65,13 @@ \begin{itemize} \item Possibly receive intermediate output \end{itemize} - \item Coordinates concurrent calculations + \item Coordinates concurrent calculations \begin{itemize} \item Dynamic queue of pending calculations \item Possibly monitor sims \end{itemize} \item Tracks history of calculations - \item Allocated resource to calculations + \item Allocated resource to calculations \begin{itemize} \item Possibly changes resources during calculations (or just simulations) \end{itemize} @@ -89,13 +89,12 @@ \end{itemize} \end{itemize} - \noindent \textbf{Worker} \begin{itemize} \item Performs the calculations given to it. \item An undivisible unit (though many workers may combine to perform one task). Examples: \begin{itemize} - \item If the simulation is an MPI executable, one worker may call + \item If the simulation is an MPI executable, one worker may call \begin{center} \texttt{mpiexec -np 32 -machinefile two\_nodes a.out} \end{center} @@ -105,13 +104,12 @@ \end{itemize} \end{itemize} - \section{Pseudocode} We outline the logic of the the manager and the workers in \algref{manager} and \algref{worker}, respectively. 
\LinesNumbered -\begin{algorithm}[H] +\begin{algorithm}[H] \SetKwComment{Comment}{$\triangleright$\ }{} \SetAlgoNlRelativeSize{-5} \SetKwInOut{Input}{input} @@ -131,7 +129,7 @@ \section{Pseudocode} \While{$\mathtt{term\_test}(H)$} { - \While{Any worker is waiting to return $\mathtt{sim}$ or $\mathtt{gen}$ results} + \While{Any worker is waiting to return $\mathtt{sim}$ or $\mathtt{gen}$ results} { Receive from all workers with $\mathtt{sim}$ and $\mathtt{gen}$ work\\ } @@ -149,9 +147,9 @@ \section{Pseudocode} } } } - Receive from active workers\\ + Receive from active workers\\ Terminate all workers - \caption{\libE manager logic \label{alg:manager}} + \caption{\libE manager logic \label{alg:manager}} \end{algorithm} \LinesNumbered @@ -167,7 +165,7 @@ \section{Pseudocode} \SetKw{break}{break} \While{\true} { - $D =$ Receive from manager \\ + $D =$ Receive from manager \\ \If{$D.tag == \mathtt{stop\_tag}$ }{\break} \If{$D.form\_subcomm$ is nonempty}{Form subcommunicator $sc$ with other workers} @@ -177,9 +175,9 @@ \section{Pseudocode} $O = D.calc\_f(sc, D.calc\_params)$ \If{First element in subcommunicator $sc$}{Report $O$ to manager} - + } - \caption{Each \libE worker's logic \label{alg:worker}} + \caption{Each \libE worker's logic \label{alg:worker}} \end{algorithm} \section{API} @@ -191,7 +189,7 @@ \section{API} % history, \begin{description} - \item[c]: [dict] + \item[c]: [dict] \begin{description} \item['comm']: [mpi4py communicator] to be used by libE \item['color']: [int] communicator color @@ -199,12 +197,12 @@ \section{API} \item[allocation\_specs]: [dict] \begin{description} - \item[manager\_ranks]: [set of ints] + \item[manager\_ranks]: [set of ints] \item[worker\_ranks]: [set of ints] \item[machinefile]: \end{description} - \item[sim\_specs]: [dict] + \item[sim\_specs]: [dict] \begin{description} \item[f]: [list of funcs] that calls each sim(s) \item[in]: [list] string keys that sim wants from history (assumed common to all sims) @@ -212,13 +210,13 @@ 
\section{API} \item[params]: [list of dicts] parameters for each f \end{description} - \item[gen\_specs]: [dict] + \item[gen\_specs]: [dict] \begin{description} \item[f]: [func] generates points to be evaluated by a sim \item[in]: [list] string keys that gen wants from history \item[out]: [list of tuples] (string keys, type, [size>1]) of gen outputs - \item[params]: [dict] additional parameters for gen\_f. - % E.g.: + \item[params]: [dict] additional parameters for gen\_f. + % E.g.: % \begin{itemize} % \item[lb]: [n-by-1 array] lower bound on sim parameters % \item[ub]: [n-by-1 array] upper bound on sim parameters @@ -233,7 +231,7 @@ \section{API} \item[min\_sim\_f\_val]: [dbl] Stop when a value below this has been found. \end{description} - % \item[history]: [numpy structured array] + % \item[history]: [numpy structured array] % \begin{description} % \item[x]: parameters given to simulation(s) % \item[f]: simulation value(s) at each x @@ -243,7 +241,6 @@ \section{API} \end{description} \end{allintypewriter} - \subsection{\texttt{sim} API} The \texttt{sim} calculations will be called by \libE with the following API:\\ @@ -290,7 +287,7 @@ \subsection{Notes:} Internally, \libE currently maintains a single data structure \texttt{H} which contains the all history information (from \texttt{sim\_specs['out'] + -gen\_specs['out']}). +gen\_specs['out']}). We have considered splitting the history \texttt{H} into multiple data structures. One possible split: @@ -298,8 +295,8 @@ \subsection{Notes:} \begin{itemize} \item[H\_in]: [numpy structured array] History of all input given to sim\_f. Rows correspond to each ``simulation evaluation''. Contains fields - in sim\_specs['in']. - + in sim\_specs['in']. + \item[H\_out]: [numpy structured array] History of all simulation output and derived quantities. 
Contains fields in sim\_specs['out']+gen\_specs['out'] \ sim\_specs['in'] \end{itemize} @@ -362,7 +359,7 @@ \section{Target problems} \item A Python function \begin{itemize} \item We assume this is thread-safe. - \item Use cases: + \item Use cases: \begin{itemize} \item 6-hump camel \item chwirut1.py @@ -371,7 +368,7 @@ \section{Target problems} \item An executable \begin{itemize} \item May use MPI - \item Must perform evaluations in a manner that won't conflict with other evaluations. + \item Must perform evaluations in a manner that won't conflict with other evaluations. \begin{itemize} \item Performs read/writes in the directory where it is run (or in a given directory) \end{itemize} @@ -383,7 +380,7 @@ \section{Target problems} \item Theta/Aurora: Unsure how to accomplish this at this time. \item Cray system: Unsure how to accomplish this at this time. \end{itemize} - \item Use case: + \item Use case: \begin{itemize} \item OPAL accelerator simulation [John Power and Nicole Neveu] \item LAMMPS simulation [Simon Phillpot and Eugene Ragasa] @@ -392,28 +389,27 @@ \section{Target problems} \item An MPI simulation with a subcommunicator \begin{itemize} \item Possibly stops regularly to communicate with manager - \item Use cases: + \item Use cases: \begin{itemize} - \item Possibly the HFBTHO simulation + \item Possibly the HFBTHO simulation \end{itemize} \end{itemize} - \item PETSc simulation + \item PETSc simulation \begin{itemize} \item Access to complete memory stack \item Easier to kill/monitor? - \item Use cases: + \item Use cases: \begin{itemize} \item Still considering different possibilities. \end{itemize} \end{itemize} \end{enumerate} - \clearpage \section{Initial test cases} In order to guide the initial development of \libE, we will focus on supporting the following use cases. (Objectives are intentionally selected to be easy to evaluate.) 
\begin{enumerate} - \item + \item \begin{description} \item[Objective:] 6-hump camel \item[Generating function:] Uniform sampling on $[0,1]^n$ with different batch sizes @@ -424,7 +420,7 @@ \section{Initial test cases} \item[Status:] Completed \end{description} \bigskip - \item + \item \begin{description} \item[Objective:] chwirut1.py \item[Generating function:] Multiple POUNDERS runs from $K$ starting points. @@ -435,18 +431,18 @@ \section{Initial test cases} \item[Status:] Completed \end{description} \bigskip - \item + \item \begin{description} \item[Objective:] chwirut1.py \item[Generating function:] Same as above, but with APOSMM giving each point and a single residual to be evaluated. \item[Functionality tested:] \ \begin{itemize} - \item Being able to give different residuals in APOSMM + \item Being able to give different residuals in APOSMM \end{itemize} \item[Status:] Completed \end{description} \bigskip - \item + \item \begin{description} \item[Objective:] HFBTHO (imbalance)/variable internal tols \item[Generating function:] POUNDERS with adaptive tolerance attempts @@ -457,10 +453,10 @@ \section{Initial test cases} \item[Status:] Currently working with Jason Sarich to get this implemented. \end{description} \bigskip - \item + \item \begin{description} - \item[Objective:] Eldad and Lauren subsurface (in TAO) - \item[Generating function:] LCAL PDECO Stefan + \item[Objective:] Eldad and Lauren subsurface (in TAO) + \item[Generating function:] LCAL PDECO Stefan \item[Functionality tested:] \ \begin{itemize} \item Stefan ?? @@ -468,10 +464,10 @@ \section{Initial test cases} \item[Status:] Looking for a sample average approximation method that can be used to generate points to be evaluated. \end{description} \bigskip - \item + \item \begin{description} \item[Objective:] chwirut1.py with stochastic noise on each residual - \item[Generating function:] POUNDERS using sample mean with the number of replications determined by iteration number. 
+ \item[Generating function:] POUNDERS using sample mean with the number of replications determined by iteration number. \item[Functionality tested:] \ \begin{itemize} \item Efficient handling of multiple evaluations of single points @@ -484,4 +480,4 @@ \section{Initial test cases} \bibliographystyle{plain} \bibliography{../bibs/masterbib} -\end{document} +\end{document} From 3e8d32993cca1c1697168c9a7f2df035a92fcb10 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Feb 2022 08:06:31 -0600 Subject: [PATCH 12/93] Spelling --- README.rst | 2 +- docs/platforms/platforms_index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a0bcf7b3a..089d2d947 100644 --- a/README.rst +++ b/README.rst @@ -89,7 +89,7 @@ created and can be parameterized by a YAML file. As of v0.8.0+dev, workers can optionally submit generator or simulator function instances to remote funcX_ endpoints, distributing an ensemble across -systems and heterogenous resources. +systems and heterogeneous resources. The example simulation and generation functions and tests require the following: diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index bb0af420e..727c9f391 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -146,7 +146,7 @@ themselves via funcX_, a distributed, high-performance function-as-a-service pla :scale: 50 :align: center -This is useful for running ensembles across machines and heterogenous resources, but +This is useful for running ensembles across machines and heterogeneous resources, but comes with several caveats: 1. 
User functions registered with funcX must be *non-persistent*, since From f4a875351e38304aa2ad9acf3b4f6e3809a8fc0b Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Feb 2022 08:18:27 -0600 Subject: [PATCH 13/93] Edits to readme --- .../scaling_tests/balsam_forces/readme.md | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 6f3f804fe..beee10c46 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -2,27 +2,27 @@ Naive Electostatics Code Test -This is designed only as an artificial, highly configurable test -code for a libEnsemble sim func. This variant is primarily to test libEnsemble's -capability to submit simulation functions to a separate machine from where libEnsemble's -manager and workers are running. +This is a synthetic, highly configurable simulation function. Its primary use +is to test libEnsemble's capability to submit simulation functions to a machine +that is distinct from the machine from where libEnsemble's manager and workers +are running. ### Forces Mini-App -A system of charged particles is set up and simulated over a number of time-steps. +A system of charged particles is initialized and simulated over a number of time-steps. -Particles position and charge are initiated by a random stream. +Particles' position and charge are initiated using a random stream. Particles are replicated on all ranks. -**Each rank** computes forces for a subset of particles (O(N^2)) -Particle force arrays are allreduced across ranks. -Particles are moved (replicated on each rank) -Total energy is appended to file forces.stat +**Each rank** computes forces for a subset of particles (`O(N^2)` operations). +Particle force arrays are `allreduced` across ranks. +Particles are moved (replicated on each rank). 
+Total energy is appended to the forces.stat file. -To run forces as a standalone executable on N procs: +To run forces as a standalone executable on `N` procs: mpirun -np N ./forces.x -This application will need to be built on the remote machine where the sims will run. +This application will need to be compiled on the remote machine where the sim_f will run. See below. ### Running with libEnsemble. @@ -33,7 +33,7 @@ On the remote machine: funcx-endpoint configure forces Configure the endpoint's `config.py` to include your project information and -match the machine's specifications. +to match the machine's specifications. See [here](https://funcx.readthedocs.io/en/latest/endpoints.html#theta-alcf) for an example ALCF Theta configuration. @@ -50,6 +50,6 @@ Application parameters can be adjusted in `funcx_forces.yaml`. Note that each function and path must be accessible and/or importable on the remote machine. Absolute paths are recommended. -To remove output before the next run: +To remove output before the next run, use: ./cleanup.sh From ae3755d3eef24f2e9148d2bfcd79363014989df3 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Feb 2022 08:20:05 -0600 Subject: [PATCH 14/93] Black on forces balsam runscript --- .../balsam_forces/run_libe_forces_balsam.py | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index e16100527..6064cc28e 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -12,22 +12,30 @@ class RemoteForces(ApplicationDefinition): site = 'three' - command_template = \ - '/Users/jnavarro/Desktop/libensemble/' + \ - 'libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x' + \ - ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + \ - 
' > out.txt 2>&1' + command_template = ( + '/Users/jnavarro/Desktop/libensemble/' + + 'libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x' + + ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + + ' > out.txt 2>&1' + ) exctr = NewBalsamMPIExecutor() exctr.register_app(RemoteForces, app_name='forces') -exctr.submit_allocation(site_id=239, num_nodes=1, wall_time_min=30, - queue='local', project='local') - -forces.gen_specs['user'].update({ - 'lb': np.array([0]), - 'ub': np.array([32767]) -}) +exctr.submit_allocation( + site_id=239, + num_nodes=1, + wall_time_min=30, + queue='local', + project='local', +) + +forces.gen_specs['user'].update( + { + 'lb': np.array([0]), + 'ub': np.array([32767]), + } +) forces.persis_info.add_random_streams() From 177471222d38d2630700a9a9c53db682768b0915 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Feb 2022 08:20:52 -0600 Subject: [PATCH 15/93] Black on forces_simf --- .../balsam_forces/forces_simf.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index b0420f2ce..259ed0000 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -9,15 +9,15 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): from libensemble.message_numbers import WORKER_DONE, WORKER_KILL, TASK_FAILED class ForcesException(Exception): - """ Raised on some issue with Forces """ + """Raised on some issue with Forces""" def perturb(particles, seed, max_fraction): MAX_SEED = 32767 """Modify particle count""" - seed_fraction = seed/MAX_SEED + seed_fraction = seed / MAX_SEED max_delta = particles * max_fraction delta = seed_fraction * max_delta - delta = delta - max_delta/2 # translate so -/+ + delta = delta - max_delta / 2 # translate so -/+ new_particles = particles + delta 
return int(new_particles) @@ -56,8 +56,15 @@ def read_last_line(filepath): args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_urlsafe(nbytes=3) - task = exctr.submit(app_name='forces', app_args=args, num_procs=8, num_nodes=1, - procs_per_node=8, max_tasks_per_node=4, workdir=workdir) + task = exctr.submit( + app_name='forces', + app_args=args, + num_procs=8, + num_nodes=1, + procs_per_node=8, + max_tasks_per_node=4, + workdir=workdir, + ) # Stat file to check for bad runs statfile = 'forces.stat' @@ -65,7 +72,7 @@ def read_last_line(filepath): line = None poll_interval = 1 # secs - while(not task.finished): + while not task.finished: # Read last line of statfile line = read_last_line(filepath) if line == "kill": From 3e125819f7628431f8711f571176987320fa8c15 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 8 Feb 2022 14:30:17 -0600 Subject: [PATCH 16/93] initial round of updating READMEs --- .../scaling_tests/balsam_forces/readme.md | 28 +++++++++++-------- .../tests/scaling_tests/forces/readme.md | 23 ++++++++------- .../scaling_tests/funcx_forces/readme.md | 18 ++++++------ 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index beee10c46..e5fd9133d 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -1,11 +1,10 @@ -## Running test run_libe_forces_funcx.py +## Running test run_libe_forces_balsam.py -Naive Electostatics Code Test +Naive Electrostatics Code Test This is a synthetic, highly configurable simulation function. Its primary use -is to test libEnsemble's capability to submit simulation functions to a machine -that is distinct from the machine from where libEnsemble's manager and workers -are running. 
+is to test libEnsemble's capability to submit application instances via the Balsam service, +including to separate machines from libEnsemble's processes. ### Forces Mini-App @@ -29,13 +28,20 @@ See below. On the remote machine: - pip install funcx-endpoint - funcx-endpoint configure forces + git clone https://github.com/argonne-lcf/balsam.git + cd balsam; pip install -e . + cd ..; balsam site init ./my-site -Configure the endpoint's `config.py` to include your project information and -to match the machine's specifications. -See [here](https://funcx.readthedocs.io/en/latest/endpoints.html#theta-alcf) for -an example ALCF Theta configuration. +You may be asked to login and authenticate with the Balsam service. Do so with +your ALCF credentials. + +Configure the `RemoteForces` class in the `run_libe_forces_balsam.py` calling +script to match the Balsam site name and the path to your `forces.x` executable. +Configure the path to the Balsam site's `data` directory in `balsam_forces.yaml` +to match the path to your site's corresponding directory. Configure the +`submit_allocation()` function in the calling script to correspond with your site's +ID (an integer found via `balsam site ls`), as well as the correct queue and project +for the machine the Balsam site was initialized on. Then to run with local comms (multiprocessing) with one manager and `N` workers: diff --git a/libensemble/tests/scaling_tests/forces/readme.md b/libensemble/tests/scaling_tests/forces/readme.md index e34fbce7d..09eb50c3a 100644 --- a/libensemble/tests/scaling_tests/forces/readme.md +++ b/libensemble/tests/scaling_tests/forces/readme.md @@ -1,25 +1,28 @@ ## Running test run_libe_forces.py -Naive Electostatics Code Test +Naive Electrostatics Code Test -This is designed only as an artificial, highly conifurable test -code for a libEnsemble sim func. +This is a synthetic, highly configurable simulation function. 
Its primary use +is to test libEnsemble's capability to launch application instances via the `MPIExecutor`. ### Forces Mini-App -A system of charged particles is set up and simulated over a number of time-steps. +A system of charged particles is initialized and simulated over a number of time-steps. -Particles position and charge are initiated by a random stream. +Particles' position and charge are initiated using a random stream. Particles are replicated on all ranks. -**Each rank** computes forces for a subset of particles (O(N^2)) -Particle force arrays are allreduced across ranks. -Particles are moved (replicated on each rank) -Total energy is appended to file forces.stat +**Each rank** computes forces for a subset of particles (`O(N^2)` operations). +Particle force arrays are `allreduced` across ranks. +Particles are moved (replicated on each rank). +Total energy is appended to the forces.stat file. -To run forces as a standalone executable on N procs: +To run forces as a standalone executable on `N` procs: mpirun -np N ./forces.x +This application will need to be compiled on the remote machine where the sim_f will run. +See below. + ### Running with libEnsemble. A random sample of seeds is taken and used as input to the sim func (forces miniapp). diff --git a/libensemble/tests/scaling_tests/funcx_forces/readme.md b/libensemble/tests/scaling_tests/funcx_forces/readme.md index 6f3f804fe..99fadb98d 100644 --- a/libensemble/tests/scaling_tests/funcx_forces/readme.md +++ b/libensemble/tests/scaling_tests/funcx_forces/readme.md @@ -1,6 +1,6 @@ ## Running test run_libe_forces_funcx.py -Naive Electostatics Code Test +Naive Electrostatics Code Test This is designed only as an artificial, highly configurable test code for a libEnsemble sim func. This variant is primarily to test libEnsemble's @@ -9,20 +9,20 @@ manager and workers are running. ### Forces Mini-App -A system of charged particles is set up and simulated over a number of time-steps. 
+A system of charged particles is initialized and simulated over a number of time-steps. -Particles position and charge are initiated by a random stream. +Particles' position and charge are initiated using a random stream. Particles are replicated on all ranks. -**Each rank** computes forces for a subset of particles (O(N^2)) -Particle force arrays are allreduced across ranks. -Particles are moved (replicated on each rank) -Total energy is appended to file forces.stat +**Each rank** computes forces for a subset of particles (`O(N^2)` operations). +Particle force arrays are `allreduced` across ranks. +Particles are moved (replicated on each rank). +Total energy is appended to the forces.stat file. -To run forces as a standalone executable on N procs: +To run forces as a standalone executable on `N` procs: mpirun -np N ./forces.x -This application will need to be built on the remote machine where the sims will run. +This application will need to be compiled on the remote machine where the sim_f will run. See below. ### Running with libEnsemble. 
From 46cbed8d57084762aba8a8aaf715f475c7a03b96 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 9 Feb 2022 10:16:52 -0600 Subject: [PATCH 17/93] trying out theta again, debug attempts --- libensemble/executors/executor.py | 2 +- libensemble/executors/new_balsam_executor.py | 1 + .../scaling_tests/balsam_forces/balsam_forces.yaml | 2 +- .../tests/scaling_tests/balsam_forces/forces_simf.py | 4 ++-- .../balsam_forces/run_libe_forces_balsam.py | 10 +++++----- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/libensemble/executors/executor.py b/libensemble/executors/executor.py index 500a17445..8ad112ed0 100644 --- a/libensemble/executors/executor.py +++ b/libensemble/executors/executor.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) # To change logging level for just this module -# logger.setLevel(logging.DEBUG) +logger.setLevel(logging.DEBUG) STATES = """ UNKNOWN diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 62608ca7d..d50d3eb39 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -115,6 +115,7 @@ def poll(self): # Get current state of tasks from Balsam database # self.process.refresh_from_db() balsam_state = self.process.state + print(balsam_state) self.runtime = self._get_time_since_balsam_submit() if balsam_state in ['RUN_DONE', 'POSTPROCESSED', 'STAGED_OUT', "JOB_FINISHED"]: diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index 0e62f9af5..28b848bea 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -14,7 +14,7 @@ sim_specs: user: keys: - seed - balsam_data_dir: /Users/jnavarro/Desktop/libensemble/newbalsam/sites/three/data/ + balsam_data_dir: /home/jnavarro/software/sites/jln_theta/data/ sim_particles: 1.e+3 sim_timesteps: 5 
sim_kill_minutes: 10.0 diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 259ed0000..b4a27541b 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -59,9 +59,9 @@ def read_last_line(filepath): task = exctr.submit( app_name='forces', app_args=args, - num_procs=8, + num_procs=16, num_nodes=1, - procs_per_node=8, + procs_per_node=16, max_tasks_per_node=4, workdir=workdir, ) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 6064cc28e..3b3dc17c2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -11,9 +11,9 @@ class RemoteForces(ApplicationDefinition): - site = 'three' + site = 'jln_theta' command_template = ( - '/Users/jnavarro/Desktop/libensemble/' + '/home/jnavarro/' + 'libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x' + ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' + ' > out.txt 2>&1' @@ -23,11 +23,11 @@ class RemoteForces(ApplicationDefinition): exctr = NewBalsamMPIExecutor() exctr.register_app(RemoteForces, app_name='forces') exctr.submit_allocation( - site_id=239, + site_id=246, num_nodes=1, wall_time_min=30, - queue='local', - project='local', + queue='debug-cache-quad', + project='CSC250STMS07', ) forces.gen_specs['user'].update( From 955e9fc5b70853c9f533fd6a9d3fd5aa2bfbbaac Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 9 Feb 2022 11:13:15 -0600 Subject: [PATCH 18/93] experiment with only manager process defining app, also not re-defining if already exists --- .../balsam_forces/run_libe_forces_balsam.py | 52 ++++++++++--------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git 
a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 3b3dc17c2..311a7095b 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -7,36 +7,38 @@ from balsam.api import ApplicationDefinition forces = Ensemble() -forces.from_yaml('balsam_forces.yaml') +forces.from_yaml("balsam_forces.yaml") - -class RemoteForces(ApplicationDefinition): - site = 'jln_theta' - command_template = ( - '/home/jnavarro/' - + 'libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x' - + ' {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}' - + ' > out.txt 2>&1' - ) - - -exctr = NewBalsamMPIExecutor() -exctr.register_app(RemoteForces, app_name='forces') -exctr.submit_allocation( - site_id=246, - num_nodes=1, - wall_time_min=30, - queue='debug-cache-quad', - project='CSC250STMS07', -) - -forces.gen_specs['user'].update( +forces.gen_specs["user"].update( { - 'lb': np.array([0]), - 'ub': np.array([32767]), + "lb": np.array([0]), + "ub": np.array([32767]), } ) forces.persis_info.add_random_streams() +if forces.is_manager(): + RemoteForces = ApplicationDefinition.load_by_site("jln_theta").get("RemoteForce") + if not RemoteForces: + + class RemoteForces(ApplicationDefinition): + site = "jln_theta" + command_template = ( + "/home/jnavarro/" + + "libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x" + + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + + " > out.txt 2>&1" + ) + + exctr = NewBalsamMPIExecutor() + exctr.register_app(RemoteForces, app_name="forces") + exctr.submit_allocation( + site_id=246, + num_nodes=1, + wall_time_min=30, + queue="debug-cache-quad", + project="CSC250STMS07", + ) + forces.run() From b73a8d58a7920f5510820bc4da1e2456ffa798b6 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 9 Feb 2022 18:09:29 -0600 Subject: [PATCH 
19/93] fix is_manager check --- .../tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 311a7095b..b3d0a5e1e 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -18,7 +18,7 @@ forces.persis_info.add_random_streams() -if forces.is_manager(): +if forces.is_manager: RemoteForces = ApplicationDefinition.load_by_site("jln_theta").get("RemoteForce") if not RemoteForces: From be945ddb83647b2a48cf4f6eb28a6a7f40feadef Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 10 Feb 2022 12:22:52 -0600 Subject: [PATCH 20/93] try turning back on refresh_from_db --- libensemble/executors/new_balsam_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index d50d3eb39..1382f4115 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -113,7 +113,7 @@ def poll(self): return # Get current state of tasks from Balsam database - # self.process.refresh_from_db() + self.process.refresh_from_db() balsam_state = self.process.state print(balsam_state) self.runtime = self._get_time_since_balsam_submit() From 2458c03fd19022d9c58b3d7efab076dfd0a38663 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 11 Feb 2022 13:46:30 -0600 Subject: [PATCH 21/93] testing something on rtd --- docs/tutorials/local_sine_tutorial.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/tutorials/local_sine_tutorial.rst b/docs/tutorials/local_sine_tutorial.rst index ed91ca29b..afdeb4d0f 100644 --- a/docs/tutorials/local_sine_tutorial.rst +++ b/docs/tutorials/local_sine_tutorial.rst @@ -116,6 +116,18 @@ 
For now, create a new Python file named ``generator.py``. Write the following: # Send back our output and persis_info return out, persis_info +.. container:: toggle + + .. container:: header + + **Click here to hello world** + + .. code-block:: python + :linenos: + :caption: examples/tutorials/simple_sine/tutorial_gen.py + def gen_random_sample(H, persis_info, gen_specs, _): + # does this work on readthedocs? + Our function creates ``batch_size`` random numbers uniformly distributed between the ``lower`` and ``upper`` bounds. A random stream from ``persis_info`` is used to generate these values, where they are placed From 6512086eda1f9d4563471f99715c1b3d8770419c Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 11 Feb 2022 13:52:33 -0600 Subject: [PATCH 22/93] add necessary css --- docs/_static/my_theme.css | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/_static/my_theme.css b/docs/_static/my_theme.css index bc3f00f33..f2c020405 100644 --- a/docs/_static/my_theme.css +++ b/docs/_static/my_theme.css @@ -1,3 +1,16 @@ .wy-nav-content { max-width: 850px !important; } + +.toggle .header { + display: block; + clear: both; +} + +.toggle .header:after { + content: " ▶"; +} + +.toggle .header.open:after { + content: " ▼"; +} From 3abf041efd77eee07de6afbbbe11999bb28b4885 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 11 Feb 2022 13:54:54 -0600 Subject: [PATCH 23/93] whitespace --- docs/tutorials/local_sine_tutorial.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/local_sine_tutorial.rst b/docs/tutorials/local_sine_tutorial.rst index afdeb4d0f..19fba0cbe 100644 --- a/docs/tutorials/local_sine_tutorial.rst +++ b/docs/tutorials/local_sine_tutorial.rst @@ -125,6 +125,7 @@ For now, create a new Python file named ``generator.py``. Write the following: .. code-block:: python :linenos: :caption: examples/tutorials/simple_sine/tutorial_gen.py + def gen_random_sample(H, persis_info, gen_specs, _): # does this work on readthedocs? 
From 06e82573d508b203a7947a38cdd5ef4cef127016 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 11 Feb 2022 14:00:08 -0600 Subject: [PATCH 24/93] adding page template for javascript, credit to stackoverflow genius --- docs/_static/{my_theme.css => custom.css} | 0 docs/_templates/page.html | 14 ++++++++++++++ docs/conf.py | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) rename docs/_static/{my_theme.css => custom.css} (100%) create mode 100644 docs/_templates/page.html diff --git a/docs/_static/my_theme.css b/docs/_static/custom.css similarity index 100% rename from docs/_static/my_theme.css rename to docs/_static/custom.css diff --git a/docs/_templates/page.html b/docs/_templates/page.html new file mode 100644 index 000000000..b8163bdde --- /dev/null +++ b/docs/_templates/page.html @@ -0,0 +1,14 @@ +{% extends "!page.html" %} + +{% block footer %} + +{% endblock %} diff --git a/docs/conf.py b/docs/conf.py index b5319c5d5..bc258e4c4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -198,7 +198,7 @@ def __getattr__(cls, name): # html_static_path = [] def setup(app): - app.add_css_file('my_theme.css') + app.add_css_file('custom.css') # Custom sidebar templates, must be a dictionary that maps document names # to template names. From 5cbb7b540d7b8b229fc0b8be5e9ba3f16eaf78f1 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 11 Feb 2022 15:21:25 -0600 Subject: [PATCH 25/93] removing toggle block for now, now that we know it works --- docs/tutorials/local_sine_tutorial.rst | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/docs/tutorials/local_sine_tutorial.rst b/docs/tutorials/local_sine_tutorial.rst index 19fba0cbe..ed91ca29b 100644 --- a/docs/tutorials/local_sine_tutorial.rst +++ b/docs/tutorials/local_sine_tutorial.rst @@ -116,19 +116,6 @@ For now, create a new Python file named ``generator.py``. Write the following: # Send back our output and persis_info return out, persis_info -.. container:: toggle - - .. 
container:: header - - **Click here to hello world** - - .. code-block:: python - :linenos: - :caption: examples/tutorials/simple_sine/tutorial_gen.py - - def gen_random_sample(H, persis_info, gen_specs, _): - # does this work on readthedocs? - Our function creates ``batch_size`` random numbers uniformly distributed between the ``lower`` and ``upper`` bounds. A random stream from ``persis_info`` is used to generate these values, where they are placed From bab4b902c08a972f28477d00c60b03259b70f458 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 17 Feb 2022 16:04:18 -0600 Subject: [PATCH 26/93] comments/readme adjusts, cleanup old forces --- libensemble/executors/new_balsam_executor.py | 5 ---- .../scaling_tests/balsam_forces/readme.md | 25 +++++++++++++------ .../scaling_tests/forces/run_libe_forces.py | 14 +++-------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 1382f4115..9f61be032 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -50,11 +50,6 @@ def __init__(self, app=None, app_args=None, workdir=None, def _get_time_since_balsam_submit(self): """Return time since balsam task entered RUNNING state""" - # If wait_on_start then can could calculate runtime same a base executor - # but otherwise that will return time from task submission. Get from Balsam. 
- - # self.runtime = self.process.runtime_seconds # Only reports at end of run currently - # balsam_launch_datetime = self.process.get_state_times().get('RUNNING', None) event_query = EventLog.objects.filter( job_id=self.process.id, to_state="RUNNING") if not len(event_query): diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index e5fd9133d..394d34695 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -4,7 +4,12 @@ Naive Electrostatics Code Test This is a synthetic, highly configurable simulation function. Its primary use is to test libEnsemble's capability to submit application instances via the Balsam service, -including to separate machines from libEnsemble's processes. +including to separate machines from libEnsemble's processes. This means that although +this is typically a HPC scaling test, this can be run on a laptop with the `forces.x` +simulation submitted to the remote machine. + +Note that this test currently requires active ALCF credentials to authenticate with +the Balsam service. ### Forces Mini-App @@ -26,20 +31,26 @@ See below. ### Running with libEnsemble. -On the remote machine: +On the remote machine (in a conda or other virtual environment): git clone https://github.com/argonne-lcf/balsam.git - cd balsam; pip install -e . - cd ..; balsam site init ./my-site + cd balsam; pip install -e .; cd ..; + balsam login + balsam site init ./my-site + cd my-site; balsam site start You may be asked to login and authenticate with the Balsam service. Do so with your ALCF credentials. +On any machine you've installed and logged into Balsam, you can run `balsam site ls` +to list your sites and `balsam job rm --all` to remove extraneous jobs between runs. 
+ Configure the `RemoteForces` class in the `run_libe_forces_balsam.py` calling -script to match the Balsam site name and the path to your `forces.x` executable. +script to match the Balsam site name and the path to the `forces.x` executable +on the remote machine. Configure the path to the Balsam site's `data` directory in `balsam_forces.yaml` -to match the path to your site's corresponding directory. Configure the -`submit_allocation()` function in the calling script to correspond with your site's +to match the path to the remote site's corresponding directory. Configure the +`submit_allocation()` function in the calling script to correspond with the site's ID (an integer found via `balsam site ls`), as well as the correct queue and project for the machine the Balsam site was initialized on. diff --git a/libensemble/tests/scaling_tests/forces/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/run_libe_forces.py index 9bb019dfb..e43fa64ff 100644 --- a/libensemble/tests/scaling_tests/forces/run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/run_libe_forces.py @@ -10,7 +10,6 @@ from libensemble import logger from forces_support import test_libe_stats, test_ensemble_dir, check_log_exception -USE_BALSAM = True PERSIS_GEN = False if PERSIS_GEN: @@ -36,16 +35,9 @@ import subprocess subprocess.check_call(['./build_forces.sh']) -# Create executor and register sim to it. 
-if USE_BALSAM: - from libensemble.executors.new_balsam_executor import NewBalsamMPIExecutor - exctr = NewBalsamMPIExecutor() - exctr.register_app(full_path=sim_app, site='three', app_name='forces') - -else: - from libensemble.executors.mpi_executor import MPIExecutor - exctr = MPIExecutor() - exctr.register_app(full_path=sim_app, app_name='forces') +from libensemble.executors.mpi_executor import MPIExecutor +exctr = MPIExecutor() +exctr.register_app(full_path=sim_app, app_name='forces') # Note: Attributes such as kill_rate are to control forces tests, this would not be a typical parameter. From dff703f5eadbe671d4d6a9bb8aa2bb019571df42 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 23 Feb 2022 16:11:45 -0600 Subject: [PATCH 27/93] initial attempt on revoke_allocation, globus data transfers --- libensemble/executors/new_balsam_executor.py | 24 ++++++-- .../balsam_forces/balsam_forces.yaml | 1 - .../balsam_forces/forces_simf.py | 59 +++++++++---------- .../balsam_forces/run_libe_forces_balsam.py | 49 ++++++++------- 4 files changed, 74 insertions(+), 59 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 9f61be032..e2304af21 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -249,11 +249,11 @@ def submit_allocation(self, site_id, num_nodes, wall_time_min, job_mode="mpi", queue="local", project="local"): """ Submits a Balsam BatchJob machine allocation request to Balsam. - Corresponding Balsam applications with a matching site can be submitted to this allocation. + Corresponding Balsam applications with a matching site can be submitted to + this allocation. 
""" - self.allocations.append( - BatchJob.objects.create( + allocation = BatchJob.objects.create( site_id=site_id, num_nodes=num_nodes, wall_time_min=wall_time_min, @@ -261,12 +261,24 @@ def submit_allocation(self, site_id, num_nodes, wall_time_min, job_mode="mpi", queue=queue, project=project ) - ) - logger.info("Submitted Batch allocation to endpoint {}: " + self.allocations.append(allocation) + + logger.info("Submitted Batch allocation to site {}: " "nodes {} queue {} project {}". format(site_id, num_nodes, queue, project)) + return allocation + + def revoke_allocation(self, allocation): + """ + Terminates a Balsam BatchJob remotely. Balsam apps should no longer be + submitted to this allocation. Best to run after libEnsemble completes + to save machine time. + """ + + BatchJob.objects.filter(scheduler_id=allocation.scheduler_id).update(state="pending_deletion") + def set_resources(self, resources): self.resources = resources @@ -326,7 +338,7 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, task.timer.start() task.submit_time = task.timer.tstart # Time not date - may not need if using timer. - logger.info("Submitted Balsam App to endpoint {}: " + logger.info("Submitted Balsam App to site {}: " "nodes {} ppn {}". 
format(App.site, num_nodes, procs_per_node)) diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index 28b848bea..5660b6d68 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -14,7 +14,6 @@ sim_specs: user: keys: - seed - balsam_data_dir: /home/jnavarro/software/sites/jln_theta/data/ sim_particles: 1.e+3 sim_timesteps: 5 sim_kill_minutes: 10.0 diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index b4a27541b..502873333 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -56,48 +56,45 @@ def read_last_line(filepath): args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_urlsafe(nbytes=3) + file_dest = os.getcwd() + "/forces_" + secrets.token_urlsafe(nbytes=3) + ".stat" + task = exctr.submit( app_name='forces', app_args=args, - num_procs=16, + num_procs=4, num_nodes=1, - procs_per_node=16, - max_tasks_per_node=4, - workdir=workdir, + procs_per_node=4, + max_tasks_per_node=2, + transfers={"result": "jln_laptop:"+file_dest}, + workdir=workdir ) # Stat file to check for bad runs statfile = 'forces.stat' - filepath = sim_specs['user']['balsam_data_dir'] + os.path.join(task.workdir, statfile) line = None - poll_interval = 1 # secs + poll_interval = 2 # secs while not task.finished: - # Read last line of statfile - line = read_last_line(filepath) - if line == "kill": - task.kill() # Bad run - elif task.runtime > time_limit: - task.kill() # Timeout - else: - time.sleep(poll_interval) - task.poll() - - if task.finished: - if task.state == 'FINISHED': - print("Task {} 
completed".format(task.name)) - calc_status = WORKER_DONE - if read_last_line(filepath) == "kill": - # Generally mark as complete if want results (completed after poll - before readline) - print("Warning: Task completed although marked as a bad run (kill flag set in forces.stat)") - elif task.state == 'FAILED': - print("Warning: Task {} failed: Error code {}".format(task.name, task.errcode)) - calc_status = TASK_FAILED - elif task.state == 'USER_KILLED': - print("Warning: Task {} has been killed".format(task.name)) - calc_status = WORKER_KILL - else: - print("Warning: Task {} in unknown state {}. Error code {}".format(task.name, task.state, task.errcode)) + time.sleep(poll_interval) + task.poll() + if task.state == 'FAILED': + break + + # if task.finished: + # if task.state == 'FINISHED': + # print("Task {} completed".format(task.name)) + # calc_status = WORKER_DONE + # if read_last_line(filepath) == "kill": + # # Generally mark as complete if want results (completed after poll - before readline) + # print("Warning: Task completed although marked as a bad run (kill flag set in forces.stat)") + # elif task.state == 'FAILED': + # print("Warning: Task {} failed: Error code {}".format(task.name, task.errcode)) + # calc_status = TASK_FAILED + # elif task.state == 'USER_KILLED': + # print("Warning: Task {} has been killed".format(task.name)) + # calc_status = WORKER_KILL + # else: + # print("Warning: Task {} in unknown state {}. 
Error code {}".format(task.name, task.state, task.errcode)) time.sleep(0.2) try: diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index b3d0a5e1e..8b87d3563 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -18,27 +18,34 @@ forces.persis_info.add_random_streams() -if forces.is_manager: - RemoteForces = ApplicationDefinition.load_by_site("jln_theta").get("RemoteForce") - if not RemoteForces: - - class RemoteForces(ApplicationDefinition): - site = "jln_theta" - command_template = ( - "/home/jnavarro/" - + "libensemble/libensemble/tests/scaling_tests/balsam_forces/forces.x" - + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" - + " > out.txt 2>&1" - ) - - exctr = NewBalsamMPIExecutor() - exctr.register_app(RemoteForces, app_name="forces") - exctr.submit_allocation( - site_id=246, - num_nodes=1, - wall_time_min=30, - queue="debug-cache-quad", - project="CSC250STMS07", +class RemoteForces(ApplicationDefinition): + site = "three" + command_template = ( + "/Users/jnavarro/Desktop/libensemble/" + + "libensemble/libensemble/tests/scaling_tests/forces/forces.x" + + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + + " > out.txt 2>&1" ) + transfers = { + "result": { + "required": True, + "direction": "out", + "local_path": "forces.stat", + "description": "Forces stat file", + "recursive": False + } + } + +exctr = NewBalsamMPIExecutor() +exctr.register_app(RemoteForces, app_name="forces") + +batch = exctr.submit_allocation( + site_id=239, + num_nodes=1, + wall_time_min=30, +) + forces.run() + +# exctr.revoke_allocation(batch) From 130e22985101703a203c059d750dd59a702d9c07 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 24 Feb 2022 14:20:07 -0600 Subject: [PATCH 28/93] replace url-safe with token-hex --- 
libensemble/tests/scaling_tests/balsam_forces/forces_simf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 502873333..a1393bdcb 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -54,9 +54,9 @@ def read_last_line(filepath): print('seed: {} particles: {}'.format(seed, sim_particles)) args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} - workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_urlsafe(nbytes=3) + workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_hex(nbytes=3) - file_dest = os.getcwd() + "/forces_" + secrets.token_urlsafe(nbytes=3) + ".stat" + file_dest = os.getcwd() + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" task = exctr.submit( app_name='forces', From 00b32b57395863ac4a376c9c04a305b8430386b4 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 28 Feb 2022 17:42:18 -0600 Subject: [PATCH 29/93] fix executor set_complete and revoke_allocation, add logic to transfer/evaluate stats files if transferred by balsam --- libensemble/executors/new_balsam_executor.py | 16 ++++--- .../balsam_forces/balsam_forces.yaml | 3 -- .../balsam_forces/forces_simf.py | 47 ++++++++----------- .../scaling_tests/balsam_forces/readme.md | 2 + .../balsam_forces/run_libe_forces_balsam.py | 30 ++++++++---- 5 files changed, 52 insertions(+), 46 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index e2304af21..5bed05439 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -84,11 +84,9 @@ def _set_complete(self, dry_run=False): balsam_state = self.process.state self.workdir = self.workdir or 
self.process.working_directory self.calc_task_timing() - self.success = (balsam_state == 'JOB_FINISHED') - if balsam_state == 'JOB_FINISHED': + if balsam_state in ['RUN_DONE', 'JOB_FINISHED']: + self.success = True self.state = 'FINISHED' - elif balsam_state == 'PARENT_KILLED': # Not currently used - self.state = 'USER_KILLED' elif balsam_state in STATES: # In my states self.state = balsam_state else: @@ -110,7 +108,6 @@ def poll(self): # Get current state of tasks from Balsam database self.process.refresh_from_db() balsam_state = self.process.state - print(balsam_state) self.runtime = self._get_time_since_balsam_submit() if balsam_state in ['RUN_DONE', 'POSTPROCESSED', 'STAGED_OUT', "JOB_FINISHED"]: @@ -276,8 +273,15 @@ def revoke_allocation(self, allocation): submitted to this allocation. Best to run after libEnsemble completes to save machine time. """ + allocation.refresh_from_db() - BatchJob.objects.filter(scheduler_id=allocation.scheduler_id).update(state="pending_deletion") + while not allocation.scheduler_id: + time.sleep(0.5) + allocation.refresh_from_db() + + batchjob = BatchJob.objects.get(scheduler_id=allocation.scheduler_id) + batchjob.state = "pending_deletion" + batchjob.save() def set_resources(self, resources): self.resources = resources diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index 5660b6d68..b3b825f0f 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -16,11 +16,8 @@ sim_specs: - seed sim_particles: 1.e+3 sim_timesteps: 5 - sim_kill_minutes: 10.0 particle_variance: 0.2 kill_rate: 0.5 - fail_on_sim: False - fail_on_submit: False gen_specs: function: libensemble.gen_funcs.sampling.uniform_random_sample diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 
a1393bdcb..ca692212c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -8,9 +8,6 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): from libensemble.executors.executor import Executor from libensemble.message_numbers import WORKER_DONE, WORKER_KILL, TASK_FAILED - class ForcesException(Exception): - """Raised on some issue with Forces""" - def perturb(particles, seed, max_fraction): MAX_SEED = 32767 """Modify particle count""" @@ -30,9 +27,6 @@ def read_last_line(filepath): line = "" # In case file is empty or not yet created return line - if sim_specs['user']['fail_on_sim']: - raise ForcesException(Exception) - calc_status = 0 # Returns to worker exctr = Executor.executor @@ -40,7 +34,7 @@ def read_last_line(filepath): x = H['x'] sim_particles = sim_specs['user']['sim_particles'] sim_timesteps = sim_specs['user']['sim_timesteps'] - time_limit = sim_specs['user']['sim_kill_minutes'] * 60.0 + TRANSFER_STATFILES = sim_specs['user']['transfer'] # Get from dictionary if key exists, else return default (e.g. 
0) kill_rate = sim_specs['user'].get('kill_rate', 0) @@ -57,6 +51,10 @@ def read_last_line(filepath): workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_hex(nbytes=3) file_dest = os.getcwd() + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" + if TRANSFER_STATFILES: + transfer = {"result": "jln_laptop:"+file_dest} + else: + transfer = {} task = exctr.submit( app_name='forces', @@ -64,8 +62,8 @@ def read_last_line(filepath): num_procs=4, num_nodes=1, procs_per_node=4, - max_tasks_per_node=2, - transfers={"result": "jln_laptop:"+file_dest}, + max_tasks_per_node=1, + transfers=transfer, workdir=workdir ) @@ -80,30 +78,25 @@ def read_last_line(filepath): if task.state == 'FAILED': break - # if task.finished: - # if task.state == 'FINISHED': - # print("Task {} completed".format(task.name)) - # calc_status = WORKER_DONE - # if read_last_line(filepath) == "kill": - # # Generally mark as complete if want results (completed after poll - before readline) - # print("Warning: Task completed although marked as a bad run (kill flag set in forces.stat)") - # elif task.state == 'FAILED': - # print("Warning: Task {} failed: Error code {}".format(task.name, task.errcode)) - # calc_status = TASK_FAILED - # elif task.state == 'USER_KILLED': - # print("Warning: Task {} has been killed".format(task.name)) - # calc_status = WORKER_KILL - # else: - # print("Warning: Task {} in unknown state {}. 
Error code {}".format(task.name, task.state, task.errcode)) + if task.state in ['FINISHED', 'FAILED']: + print("Task {} exited with state {}.".format(task.name, task.state)) + if TRANSFER_STATFILES: + if read_last_line(file_dest) == "kill": + print("Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)") + calc_status = TASK_FAILED + else: + calc_status = WORKER_DONE + else: + calc_status = WORKER_DONE + else: + print(task.state) time.sleep(0.2) try: - data = np.loadtxt(filepath) - # task.read_file_in_workdir(statfile) + data = np.loadtxt(file_dest) final_energy = data[-1] except Exception: final_energy = np.nan - # print('Warning - Energy Nan') outspecs = sim_specs['out'] output = np.zeros(1, dtype=outspecs) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 394d34695..98850325e 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -70,3 +70,5 @@ remote machine. Absolute paths are recommended. To remove output before the next run, use: ./cleanup.sh + +### diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 8b87d3563..9c770b3a8 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -3,33 +3,41 @@ from libensemble import Ensemble from libensemble.executors import NewBalsamMPIExecutor - from balsam.api import ApplicationDefinition +# Use Globus to transfer output forces.stat files back? 
+TRANSFER_STATFILES = True + forces = Ensemble() forces.from_yaml("balsam_forces.yaml") forces.gen_specs["user"].update( { "lb": np.array([0]), - "ub": np.array([32767]), + "ub": np.array([32767]) } ) -forces.persis_info.add_random_streams() +forces.sim_specs["user"].update( + { + "transfer": TRANSFER_STATFILES + } +) +forces.persis_info.add_random_streams() +# class RemoteForces(ApplicationDefinition): - site = "three" + site = "jln_theta" command_template = ( - "/Users/jnavarro/Desktop/libensemble/" - + "libensemble/libensemble/tests/scaling_tests/forces/forces.x" + "/home/jnavarro" + + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + " > out.txt 2>&1" ) transfers = { "result": { - "required": True, + "required": False, "direction": "out", "local_path": "forces.stat", "description": "Forces stat file", @@ -41,11 +49,13 @@ class RemoteForces(ApplicationDefinition): exctr.register_app(RemoteForces, app_name="forces") batch = exctr.submit_allocation( - site_id=239, - num_nodes=1, + site_id=246, + num_nodes=4, wall_time_min=30, + queue="debug-flat-quad", + project="CSC250STMS07" ) forces.run() -# exctr.revoke_allocation(batch) +exctr.revoke_allocation(batch) From e287ee0c9a7f38998564f7dffdf6845a1c573c6d Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 28 Feb 2022 17:44:30 -0600 Subject: [PATCH 30/93] black --- .../balsam_forces/forces_simf.py | 45 +++++++++++-------- .../balsam_forces/run_libe_forces_balsam.py | 19 +++----- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index ca692212c..d574ed648 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -21,7 +21,7 @@ def perturb(particles, seed, max_fraction): def read_last_line(filepath): """Read last line of 
statfile""" try: - with open(filepath, 'rb') as fh: + with open(filepath, "rb") as fh: line = fh.readlines()[-1].decode().rstrip() except Exception: line = "" # In case file is empty or not yet created @@ -31,58 +31,65 @@ def read_last_line(filepath): exctr = Executor.executor - x = H['x'] - sim_particles = sim_specs['user']['sim_particles'] - sim_timesteps = sim_specs['user']['sim_timesteps'] - TRANSFER_STATFILES = sim_specs['user']['transfer'] + x = H["x"] + sim_particles = sim_specs["user"]["sim_particles"] + sim_timesteps = sim_specs["user"]["sim_timesteps"] + TRANSFER_STATFILES = sim_specs["user"]["transfer"] # Get from dictionary if key exists, else return default (e.g. 0) - kill_rate = sim_specs['user'].get('kill_rate', 0) - particle_variance = sim_specs['user'].get('particle_variance', 0) + kill_rate = sim_specs["user"].get("kill_rate", 0) + particle_variance = sim_specs["user"].get("particle_variance", 0) # Composing variable names and x values to set up simulation seed = int(np.rint(x[0][0])) # This is to give a random variance of work-load sim_particles = perturb(sim_particles, seed, particle_variance) - print('seed: {} particles: {}'.format(seed, sim_particles)) + print("seed: {} particles: {}".format(seed, sim_particles)) - args = {"sim_particles": sim_particles, "sim_timesteps": sim_timesteps, "seed": seed, "kill_rate": kill_rate} - workdir = 'worker' + str(libE_info['workerID']) + '_' + secrets.token_hex(nbytes=3) + args = { + "sim_particles": sim_particles, + "sim_timesteps": sim_timesteps, + "seed": seed, + "kill_rate": kill_rate, + } + workdir = "worker" + str(libE_info["workerID"]) + "_" + secrets.token_hex(nbytes=3) file_dest = os.getcwd() + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" if TRANSFER_STATFILES: - transfer = {"result": "jln_laptop:"+file_dest} + transfer = {"result": "jln_laptop:" + file_dest} else: transfer = {} task = exctr.submit( - app_name='forces', + app_name="forces", app_args=args, num_procs=4, num_nodes=1, 
procs_per_node=4, max_tasks_per_node=1, transfers=transfer, - workdir=workdir + workdir=workdir, ) # Stat file to check for bad runs - statfile = 'forces.stat' + statfile = "forces.stat" line = None poll_interval = 2 # secs while not task.finished: time.sleep(poll_interval) task.poll() - if task.state == 'FAILED': + if task.state == "FAILED": break - if task.state in ['FINISHED', 'FAILED']: + if task.state in ["FINISHED", "FAILED"]: print("Task {} exited with state {}.".format(task.name, task.state)) if TRANSFER_STATFILES: if read_last_line(file_dest) == "kill": - print("Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)") + print( + "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" + ) calc_status = TASK_FAILED else: calc_status = WORKER_DONE @@ -98,8 +105,8 @@ def read_last_line(filepath): except Exception: final_energy = np.nan - outspecs = sim_specs['out'] + outspecs = sim_specs["out"] output = np.zeros(1, dtype=outspecs) - output['energy'][0] = final_energy + output["energy"][0] = final_energy return output, persis_info, calc_status diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 9c770b3a8..9c0747004 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -11,18 +11,8 @@ forces = Ensemble() forces.from_yaml("balsam_forces.yaml") -forces.gen_specs["user"].update( - { - "lb": np.array([0]), - "ub": np.array([32767]) - } -) - -forces.sim_specs["user"].update( - { - "transfer": TRANSFER_STATFILES - } -) +forces.gen_specs["user"].update({"lb": np.array([0]), "ub": np.array([32767])}) +forces.sim_specs["user"].update({"transfer": TRANSFER_STATFILES}) forces.persis_info.add_random_streams() # @@ -41,10 +31,11 @@ class RemoteForces(ApplicationDefinition): 
"direction": "out", "local_path": "forces.stat", "description": "Forces stat file", - "recursive": False + "recursive": False, } } + exctr = NewBalsamMPIExecutor() exctr.register_app(RemoteForces, app_name="forces") @@ -53,7 +44,7 @@ class RemoteForces(ApplicationDefinition): num_nodes=4, wall_time_min=30, queue="debug-flat-quad", - project="CSC250STMS07" + project="CSC250STMS07", ) forces.run() From 7b92854d583468d978887b966fbf9962ac481147 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Mar 2022 13:40:02 -0600 Subject: [PATCH 31/93] initial globus docs in readme, additional improvements --- libensemble/executors/new_balsam_executor.py | 2 +- .../balsam_forces/forces_simf.py | 10 ++++- .../scaling_tests/balsam_forces/readme.md | 38 +++++++++++++++---- .../balsam_forces/run_libe_forces_balsam.py | 7 +++- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 5bed05439..aa93da6df 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -276,7 +276,7 @@ def revoke_allocation(self, allocation): allocation.refresh_from_db() while not allocation.scheduler_id: - time.sleep(0.5) + time.sleep(1) allocation.refresh_from_db() batchjob = BatchJob.objects.get(scheduler_id=allocation.scheduler_id) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index d574ed648..0dc7d44d2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -35,6 +35,7 @@ def read_last_line(filepath): sim_particles = sim_specs["user"]["sim_particles"] sim_timesteps = sim_specs["user"]["sim_timesteps"] TRANSFER_STATFILES = sim_specs["user"]["transfer"] + globus_endpoint = sim_specs["user"]["globus_endpoint"] # Get from dictionary if key exists, else return default (e.g. 
0) kill_rate = sim_specs["user"].get("kill_rate", 0) @@ -57,7 +58,7 @@ def read_last_line(filepath): file_dest = os.getcwd() + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" if TRANSFER_STATFILES: - transfer = {"result": "jln_laptop:" + file_dest} + transfer = {"result": globus_endpoint + ":" + file_dest} else: transfer = {} @@ -86,6 +87,11 @@ def read_last_line(filepath): if task.state in ["FINISHED", "FAILED"]: print("Task {} exited with state {}.".format(task.name, task.state)) if TRANSFER_STATFILES: + print("Waiting for Task {} statfile.".format(task.name)) + while file_dest not in [ + os.path.join(os.getcwd(), i) for i in os.listdir(".") + ]: + time.sleep(1) if read_last_line(file_dest) == "kill": print( "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" @@ -93,8 +99,10 @@ def read_last_line(filepath): calc_status = TASK_FAILED else: calc_status = WORKER_DONE + print("Task completed successfully. forces.stat retrieved.") else: calc_status = WORKER_DONE + print("Task completed.") else: print(task.state) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 98850325e..13d2a0c9f 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -29,7 +29,7 @@ To run forces as a standalone executable on `N` procs: This application will need to be compiled on the remote machine where the sim_f will run. See below. -### Running with libEnsemble. +### Configuring Balsam On the remote machine (in a conda or other virtual environment): @@ -45,14 +45,13 @@ your ALCF credentials. On any machine you've installed and logged into Balsam, you can run `balsam site ls` to list your sites and `balsam job rm --all` to remove extraneous jobs between runs. +### Configuring and Running libEnsemble. 
+ Configure the `RemoteForces` class in the `run_libe_forces_balsam.py` calling script to match the Balsam site name and the path to the `forces.x` executable -on the remote machine. -Configure the path to the Balsam site's `data` directory in `balsam_forces.yaml` -to match the path to the remote site's corresponding directory. Configure the -`submit_allocation()` function in the calling script to correspond with the site's -ID (an integer found via `balsam site ls`), as well as the correct queue and project -for the machine the Balsam site was initialized on. +on the remote machine. Configure the `submit_allocation()` function in the calling +script to correspond with the site's ID (an integer found via `balsam site ls`), +as well as the correct queue and project for the machine the Balsam site was initialized on. Then to run with local comms (multiprocessing) with one manager and `N` workers: @@ -71,4 +70,27 @@ To remove output before the next run, use: ./cleanup.sh -### +### (Optional) Configuring data-transfer via Balsam and Globus + +Although the raw results of forces runs are available in Balsam sites, remote or +local, this alone is not sufficient for the simulation function to evaluate +results and determine the final status of an app run if the app is running +on another machine. + +Balsam can coordinate data transfers via Globus between Globus endpoints. Assuming +this test is being run on a personal device, do the following to configure Globus, +then Balsam to use Globus. + +- Log in to [Globus](https://www.globus.org/) using ALCF or other approved organization credentials. +- Download and run [Globus Connect Personal](https://app.globus.org/file-manager/gcp) to register your device as a Globus endpoint. +- Once a Globus collection has been initialized in Globus Connect Personal, log in to Globus and click "Endpoints" on the left. +- Click the collection that was created on your personal device. Copy the string after "Endpoint UUID". 
+- Log in to the remote machine, switch to your Balsam site directory, and run ``balsam site globus-login``. +- Modify ``settings.yml`` to contain a new transfer_location that matches your device, with the copied endpoint UUID. +- Run ``balsam site sync`` within the site directory to save these changes. +- Locally, in the calling script, enable ``TRANSFER_STATFILES`` and set ``GLOBUS_ENDPOINT`` to the key for the previously-defined transfer_location. + +This should be sufficient for ``forces.stat`` files from remote Balsam app runs +to be transferred back to your local launch directory after every app run. The +simulation function will wait for Balsam to transfer back a stat file, then determine +the calc status based on the received output. diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 9c0747004..3c22f365c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -5,14 +5,17 @@ from libensemble.executors import NewBalsamMPIExecutor from balsam.api import ApplicationDefinition -# Use Globus to transfer output forces.stat files back? 
+# Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True +GLOBUS_ENDPOINT = "jln_laptop" forces = Ensemble() forces.from_yaml("balsam_forces.yaml") forces.gen_specs["user"].update({"lb": np.array([0]), "ub": np.array([32767])}) -forces.sim_specs["user"].update({"transfer": TRANSFER_STATFILES}) +forces.sim_specs["user"].update( + {"transfer": TRANSFER_STATFILES, "globus_endpoint": GLOBUS_ENDPOINT} +) forces.persis_info.add_random_streams() # From 385399bdbedc567f7b570cac38c758f28fedd3ed Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Mar 2022 13:46:24 -0600 Subject: [PATCH 32/93] adding POSTPROCESSED as a success balsam state, small fix to cleanup, add message for start of polling --- libensemble/executors/new_balsam_executor.py | 2 +- libensemble/tests/scaling_tests/balsam_forces/cleanup.sh | 2 +- libensemble/tests/scaling_tests/balsam_forces/forces_simf.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index aa93da6df..6eb36fbb0 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -84,7 +84,7 @@ def _set_complete(self, dry_run=False): balsam_state = self.process.state self.workdir = self.workdir or self.process.working_directory self.calc_task_timing() - if balsam_state in ['RUN_DONE', 'JOB_FINISHED']: + if balsam_state in ['RUN_DONE', 'JOB_FINISHED', 'POSTPROCESSED']: self.success = True self.state = 'FINISHED' elif balsam_state in STATES: # In my states diff --git a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh index 54c41aa6e..e3ec82dee 100755 --- a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh +++ b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh @@ -1 +1 @@ -rm -r ensemble_* *.npy *.pickle ensemble.log lib*.txt +rm -r ensemble_* *.npy *.pickle ensemble.log lib*.txt *.stat 
diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 0dc7d44d2..a6534be9c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -78,6 +78,7 @@ def read_last_line(filepath): line = None poll_interval = 2 # secs + print("Beginning to poll Task {}".format(task.name)) while not task.finished: time.sleep(poll_interval) task.poll() From fd579f97d679ca295f8ef1a41de56817ae4eaf32 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Mar 2022 17:20:00 -0600 Subject: [PATCH 33/93] black, refactoring, new documentation --- libensemble/executors/new_balsam_executor.py | 342 ++++++++++++++----- 1 file changed, 249 insertions(+), 93 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 6eb36fbb0..c22487806 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -18,8 +18,14 @@ import time import datetime -from libensemble.executors.executor import \ - Application, Task, ExecutorException, TimeoutExpired, jassert, STATES +from libensemble.executors.executor import ( + Application, + Task, + ExecutorException, + TimeoutExpired, + jassert, + STATES, +) from libensemble.executors.mpi_executor import MPIExecutor from balsam.api import Job, BatchJob, EventLog @@ -30,14 +36,24 @@ class BalsamTask(Task): - """Wraps a Balsam Task from the Balsam service + """Wraps a Balsam Job from the Balsam service. - The same attributes and query routines are implemented. + The same attributes and query routines are implemented. Use ``task.process`` + to refer to the matching Balsam Job initialized by the NewBalsamMPIExecutor, + with every Balsam Job method invokable on it. Otherwise, libEnsemble task methods + like ``poll()`` can be used directly. 
""" - def __init__(self, app=None, app_args=None, workdir=None, - stdout=None, stderr=None, workerid=None): + def __init__( + self, + app=None, + app_args=None, + workdir=None, + stdout=None, + stderr=None, + workerid=None, + ): """Instantiate a new BalsamTask instance. A new BalsamTask object is created with an id, status and @@ -51,7 +67,8 @@ def _get_time_since_balsam_submit(self): """Return time since balsam task entered RUNNING state""" event_query = EventLog.objects.filter( - job_id=self.process.id, to_state="RUNNING") + job_id=self.process.id, to_state="RUNNING" + ) if not len(event_query): return 0 balsam_launch_datetime = event_query[0].timestamp @@ -79,26 +96,33 @@ def _set_complete(self, dry_run=False): self.finished = True if dry_run: self.success = True - self.state = 'FINISHED' + self.state = "FINISHED" else: balsam_state = self.process.state self.workdir = self.workdir or self.process.working_directory self.calc_task_timing() - if balsam_state in ['RUN_DONE', 'JOB_FINISHED', 'POSTPROCESSED']: + if balsam_state in [ + "RUN_DONE", + "POSTPROCESSED", + "STAGED_OUT", + "JOB_FINISHED", + ]: self.success = True - self.state = 'FINISHED' + self.state = "FINISHED" elif balsam_state in STATES: # In my states self.state = balsam_state else: - logger.warning("Task finished, but in unrecognized " - "Balsam state {}".format(balsam_state)) - self.state = 'UNKNOWN' + logger.warning( + "Task finished, but in unrecognized " + "Balsam state {}".format(balsam_state) + ) + self.state = "UNKNOWN" - logger.info("Task {} ended with state {}". - format(self.name, self.state)) + logger.info("Task {} ended with state {}".format(self.name, self.state)) def poll(self): - """Polls and updates the status attributes of the supplied task""" + """Polls and updates the status attributes of the supplied task. 
Requests + Job information from Balsam service.""" if self.dry_run: return @@ -110,24 +134,30 @@ def poll(self): balsam_state = self.process.state self.runtime = self._get_time_since_balsam_submit() - if balsam_state in ['RUN_DONE', 'POSTPROCESSED', 'STAGED_OUT', "JOB_FINISHED"]: + if balsam_state in ["RUN_DONE", "POSTPROCESSED", "STAGED_OUT", "JOB_FINISHED"]: self._set_complete() - elif balsam_state in ['RUNNING']: - self.state = 'RUNNING' + elif balsam_state in ["RUNNING"]: + self.state = "RUNNING" self.workdir = self.workdir or self.process.working_directory - elif balsam_state in ['CREATED', 'AWAITING_PARENTS', - 'READY', 'STAGED_IN', 'PREPROCESSED']: - self.state = 'WAITING' + elif balsam_state in [ + "CREATED", + "AWAITING_PARENTS", + "READY", + "STAGED_IN", + "PREPROCESSED", + ]: + self.state = "WAITING" - elif balsam_state in ['RUN_ERROR', 'RUN_TIMEOUT', 'FAILED']: - self.state = 'FAILED' + elif balsam_state in ["RUN_ERROR", "RUN_TIMEOUT", "FAILED"]: + self.state = "FAILED" else: raise ExecutorException( "Task state returned from Balsam is not in known list of " - "Balsam states. Task state is {}".format(balsam_state)) + "Balsam states. 
Task state is {}".format(balsam_state) + ) def wait(self, timeout=None): """Waits on completion of the task or raises TimeoutExpired exception @@ -137,7 +167,7 @@ def wait(self, timeout=None): Parameters ---------- - timeout: + timeout: int Time in seconds after which a TimeoutExpired exception is raised""" if self.dry_run: @@ -149,7 +179,12 @@ def wait(self, timeout=None): # Wait on the task start = time.time() self.process.refresh_from_db() - while self.process.state not in ['RUN_DONE', 'POSTPROCESSED', 'STAGED_OUT', "JOB_FINISHED"]: + while self.process.state not in [ + "RUN_DONE", + "POSTPROCESSED", + "STAGED_OUT", + "JOB_FINISHED", + ]: time.sleep(0.2) self.process.refresh_from_db() if timeout and time.time() - start > timeout: @@ -159,23 +194,25 @@ def wait(self, timeout=None): self.runtime = self._get_time_since_balsam_submit() self._set_complete() - def kill(self, wait_time=None): - """ Kills or cancels the supplied task """ + def kill(self): + """Cancels the supplied task. Killing is unsupported at this time.""" self.process.delete() logger.info("Killing task {}".format(self.name)) - self.state = 'USER_KILLED' + self.state = "USER_KILLED" self.finished = True self.calc_task_timing() class NewBalsamMPIExecutor(MPIExecutor): - """Inherits from MPIExecutor and wraps the Balsam task management service + """Inherits from MPIExecutor and wraps the Balsam service. Via this Executor, + Balsam Jobs can be submitted to Balsam sites, either local or on remote machines. .. note:: Task kills are not configurable in the Balsam executor. """ + def __init__(self, custom_info={}): """Instantiate a new BalsamMPIExecutor instance. 
@@ -184,7 +221,9 @@ def __init__(self, custom_info={}): """ if custom_info: - logger.warning("The Balsam executor does not support custom_info - ignoring") + logger.warning( + "The Balsam executor does not support custom_info - ignoring" + ) super().__init__(custom_info) @@ -192,34 +231,23 @@ def __init__(self, custom_info={}): self.allocations = [] def serial_setup(self): - """Balsam serial setup includes empyting database and adding applications""" + """Balsam serial setup includes emptying database and adding applications""" pass - # for app in self.apps.values(): - # calc_name = app.gname - # desc = app.desc - # full_path = app.full_path - # site = app.site - # self.add_app(calc_name, site, full_path, desc) - def add_app(self, name, site, exepath, desc): - """ Sync application with balsam service """ + """Sync application with balsam service""" pass - logger.debug("Added App {}".format(name)) def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): - """Registers a Balsam application instance to libEnsemble. - - The ``full_path`` of the application must be supplied. Either - ``app_name`` or ``calc_type`` can be used to identify the - application in user scripts (in the **submit** function). - ``app_name`` is recommended. + """Registers a Balsam ApplicationDefinition to libEnsemble. This class + instance *must* have a ``site`` and ``command_template`` specified. See + the Balsam docs for information on other optional fields. Parameters ---------- - full_path: String - The full path of the user application to be registered + BalsamApp: ApplicationDefinition object + A Balsam ApplicationDefinition instance. app_name: String, optional Name to identify this application. @@ -238,40 +266,83 @@ def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): # Default sim/gen apps will be deprecated. Just use names. 
if calc_type is not None: - jassert(calc_type in self.default_apps, - "Unrecognized calculation type", calc_type) + jassert( + calc_type in self.default_apps, + "Unrecognized calculation type", + calc_type, + ) self.default_apps[calc_type] = self.apps[app_name] - def submit_allocation(self, site_id, num_nodes, wall_time_min, job_mode="mpi", - queue="local", project="local"): + def submit_allocation( + self, + site_id, + num_nodes, + wall_time_min, + job_mode="mpi", + queue="local", + project="local", + ): """ - Submits a Balsam BatchJob machine allocation request to Balsam. + Submits a Balsam ``BatchJob`` machine allocation request to Balsam. Corresponding Balsam applications with a matching site can be submitted to this allocation. + + Parameters + ---------- + + site_id: int + The corresponding site_id for a Balsam site. Retrieve via ``balsam site ls`` + + num_nodes: int + The number of nodes to request from a machine with a running Balsam site + + wall_time_min: int + The number of walltime minutes to request for the BatchJob allocation + + job_mode: String, optional + Either "serial" or "mpi". Default: "mpi" + + queue: String, optional + Specifies the queue from which the BatchJob should request nodes. Default: "local" + + project: String, optional + Specifies the project that should be charged for the requested hours. Default: "local" + + Returns + ------- + + The corresponding ``BatchJob`` object. """ allocation = BatchJob.objects.create( - site_id=site_id, - num_nodes=num_nodes, - wall_time_min=wall_time_min, - job_mode=job_mode, - queue=queue, - project=project - ) + site_id=site_id, + num_nodes=num_nodes, + wall_time_min=wall_time_min, + job_mode=job_mode, + queue=queue, + project=project, + ) self.allocations.append(allocation) - logger.info("Submitted Batch allocation to site {}: " - "nodes {} queue {} project {}". 
- format(site_id, num_nodes, queue, project)) + logger.info( + "Submitted Batch allocation to site {}: " + "nodes {} queue {} project {}".format(site_id, num_nodes, queue, project) + ) return allocation def revoke_allocation(self, allocation): """ - Terminates a Balsam BatchJob remotely. Balsam apps should no longer be - submitted to this allocation. Best to run after libEnsemble completes - to save machine time. + Terminates a Balsam BatchJob machine allocation remotely. Balsam apps should + no longer be submitted to this allocation. Best to run after libEnsemble + completes, or after this BatchJob is no longer needed. Helps save machine time. + + Parameters + ---------- + + allocation: BatchJob object + a BatchJob with a corresponding machine allocation that should be cancelled. """ allocation.refresh_from_db() @@ -286,14 +357,87 @@ def revoke_allocation(self, allocation): def set_resources(self, resources): self.resources = resources - def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, - num_nodes=None, procs_per_node=None, max_tasks_per_node=None, - machinefile=None, gpus_per_rank=0, transfers={}, - workdir='', dry_run=False, wait_on_start=False, extra_args={}): - """Creates a new task, and either executes or schedules to execute - in the executor + def submit( + self, + calc_type=None, + app_name=None, + app_args=None, + num_procs=None, + num_nodes=None, + procs_per_node=None, + max_tasks_per_node=None, + machinefile=None, + gpus_per_rank=0, + transfers={}, + workdir="", + dry_run=False, + wait_on_start=False, + extra_args={}, + ): + """Initializes and submits a Balsam Job based on a registered ApplicationDefinition + and requested resource parameters. A corresponding libEnsemble Task object + is created and returned. + + calc_type: String, optional + The calculation type: 'sim' or 'gen' + Only used if app_name is not supplied. Uses default sim or gen application. + + app_name: String, optional + The application name. 
Must be supplied if calc_type is not. + + app_args: dict + A dictionary of options that correspond to fields to template in the + ApplicationDefinition's ``command_template`` field. + + num_procs: int, optional + The total number of MPI ranks on which to submit the task + + num_nodes: int, optional + The number of nodes on which to submit the task + + procs_per_node: int, optional + The processes per node for this task + + max_tasks_per_node: int + Instructs Balsam to schedule at most this many Jobs per node. + + machinefile: string, optional + Name of a machinefile for this task to use. Unused by Balsam + + gpus_per_rank: int + Number of GPUs to reserve for each MPI rank + + transfers: dict + A Job-specific Balsam transfers dictionary that corresponds with an + ApplicationDefinition ``transfers`` field. See the Balsam docs for + more information. + + workdir: String + Specifies a name for the Job's output directory within the Balsam site's + data directory. Default: libe_workflow + + dry_run: boolean, optional + Whether this is a dry_run - no task will be launched; instead + runline is printed to logger (at INFO level) + + wait_on_start: boolean, optional + Whether to block, and wait for task to be polled as RUNNING (or other + active/end state) before continuing + + extra_args: dict + Additional arguments to supply to MPI runner. + + Returns + ------- + + task: obj: Task + The launched task object + + Note that since Balsam Jobs are often sent to entirely different machines + than where libEnsemble is running, how libEnsemble's resource manager + has divided local resources among workers doesn't impact what resources + can be requested for a Balsam Job running on an entirely different machine. - The created task object is returned. 
""" if app_name is not None: @@ -311,40 +455,52 @@ def submit(self, calc_type=None, app_name=None, app_args=None, num_procs=None, # Specific to this class if machinefile is not None: logger.warning("machinefile arg ignored - not supported in Balsam") - jassert(num_procs or num_nodes or procs_per_node, - "No procs/nodes provided - aborting") + jassert( + num_procs or num_nodes or procs_per_node, + "No procs/nodes provided - aborting", + ) + + if not len(self.allocations): + logger.warning( + "Balsam Job submitted with no active BatchJobs! Initialize a matching BatchJob." + ) - task = BalsamTask(app, app_args, workdir, - None, None, self.workerID) + task = BalsamTask(app, app_args, workdir, None, None, self.workerID) if dry_run: task.dry_run = True - logger.info('Test (No submit) Balsam app {}'.format(app_name)) + logger.info("Test (No submit) Balsam app {}".format(app_name)) task._set_complete(dry_run=True) else: App = app.pyobj App.sync() - task.process = Job(app_id=App, workdir=workdir, - parameters=app_args, - num_nodes=num_nodes, - ranks_per_node=procs_per_node, - launch_params=extra_args, - gpus_per_rank=gpus_per_rank, - node_packing_count=max_tasks_per_node, - transfers=transfers) + task.process = Job( + app_id=App, + workdir=workdir, + parameters=app_args, + num_nodes=num_nodes, + ranks_per_node=procs_per_node, + launch_params=extra_args, + gpus_per_rank=gpus_per_rank, + node_packing_count=max_tasks_per_node, + transfers=transfers, + ) task.process.save() - if (wait_on_start): + if wait_on_start: self._wait_on_start(task) if not task.timer.timing: task.timer.start() - task.submit_time = task.timer.tstart # Time not date - may not need if using timer. + task.submit_time = ( + task.timer.tstart + ) # Time not date - may not need if using timer. - logger.info("Submitted Balsam App to site {}: " - "nodes {} ppn {}". 
- format(App.site, num_nodes, procs_per_node)) + logger.info( + "Submitted Balsam App to site {}: " + "nodes {} ppn {}".format(App.site, num_nodes, procs_per_node) + ) # task.workdir = task.process.working_directory # Might not be set yet! self.list_of_tasks.append(task) From 6d4eb4787e1e78f181ae0f08ae58f0c83a535ec5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Mar 2022 17:36:16 -0600 Subject: [PATCH 34/93] slight rename of new Balsam Executor, adds autodocing, fixes class methods not displaying --- docs/executor/balsam_executor.rst | 32 +++++++++++++++---- docs/executor/ex_index.rst | 9 +++--- libensemble/executors/__init__.py | 4 +-- libensemble/executors/new_balsam_executor.py | 26 +++++++-------- .../balsam_forces/run_libe_forces_balsam.py | 4 +-- 5 files changed, 45 insertions(+), 30 deletions(-) diff --git a/docs/executor/balsam_executor.rst b/docs/executor/balsam_executor.rst index 83b7a0482..8f1de6930 100644 --- a/docs/executor/balsam_executor.rst +++ b/docs/executor/balsam_executor.rst @@ -1,17 +1,35 @@ -Balsam MPI Executor -=================== +Balsam Executors +================ + +Balsam 2 Executor +----------------- + +.. automodule:: new_balsam_executor + :no-undoc-members: + +.. autoclass:: NewBalsamExecutor + :show-inheritance: + :members: __init__, register_app, submit_allocation, revoke_allocation, submit + +.. autoclass:: NewBalsamTask + :show-inheritance: + :member-order: bysource + :members: poll, wait, kill + +Balsam 1 MPI Executor +--------------------- .. automodule:: balsam_executor :no-undoc-members: .. autoclass:: BalsamMPIExecutor :show-inheritance: -.. :inherited-members: -.. :member-order: bysource -.. :members: __init__, submit, poll, manager_poll, kill, set_kill_mode + :inherited-members: + :member-order: bysource + :members: __init__, submit, poll, manager_poll, kill, set_kill_mode .. autoclass:: BalsamTask :show-inheritance: :member-order: bysource -.. 
:members: workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout -.. :inherited-members: + :members: workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout + :inherited-members: diff --git a/docs/executor/ex_index.rst b/docs/executor/ex_index.rst index ded4f9061..0ba710537 100644 --- a/docs/executor/ex_index.rst +++ b/docs/executor/ex_index.rst @@ -1,11 +1,10 @@ .. _executor_index: -Executor -======== +Executors +========= -libEnsemble's Executor can be used within the simulator (and, potentially, the generator) -functions to provide a simple, portable interface for running and managing user -applications. +libEnsemble's Executors can be used within user functions to provide a simple, +portable interface for running and managing user applications. .. toctree:: :maxdepth: 2 diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 98b9175cd..aab130004 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -1,10 +1,10 @@ from libensemble.executors.executor import Executor from libensemble.executors.mpi_executor import MPIExecutor -from libensemble.executors.new_balsam_executor import NewBalsamMPIExecutor +from libensemble.executors.new_balsam_executor import NewBalsamExecutor import os import sys if 'BALSAM_DB_PATH' in os.environ and int(sys.version[2]) >= 6: from libensemble.executors.balsam_executor import BalsamMPIExecutor -__all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamMPIExecutor'] +__all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamExecutor'] diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index c22487806..11cb06b80 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -1,15 +1,13 @@ """ -This module launches and controls the running of tasks with Balsam. - -.. 
note:: Balsam is supported only when using ``mpi`` comms and requires Python 3.6 or higher. +This module launches and controls the running of tasks with Balsam 2, and most +notably can submit tasks from any machine, to any machine running a Balsam site. In order to create a Balsam executor, the calling script should contain :: - exctr = BalsamMPIExecutor() + exctr = NewBalsamExecutor() -The Balsam executor inherits from the MPI executor. See the -:doc:`MPIExecutor` for shared API. Any differences are -shown below. +One key difference to consider is that instead of registering paths to apps, +Balsam ApplicationDefinition instances must be registered instead. """ @@ -26,7 +24,7 @@ jassert, STATES, ) -from libensemble.executors.mpi_executor import MPIExecutor +from libensemble.executors import Executor from balsam.api import Job, BatchJob, EventLog @@ -35,11 +33,11 @@ # logger.setLevel(logging.DEBUG) -class BalsamTask(Task): +class NewBalsamTask(Task): """Wraps a Balsam Job from the Balsam service. The same attributes and query routines are implemented. Use ``task.process`` - to refer to the matching Balsam Job initialized by the NewBalsamMPIExecutor, + to refer to the matching Balsam Job initialized by the NewBalsamExecutor, with every Balsam Job method invokable on it. Otherwise, libEnsemble task methods like ``poll()`` can be used directly. @@ -54,9 +52,9 @@ def __init__( stderr=None, workerid=None, ): - """Instantiate a new BalsamTask instance. + """Instantiate a new NewBalsamTask instance. - A new BalsamTask object is created with an id, status and + A new NewBalsamTask object is created with an id, status and configuration attributes. This will normally be created by the executor on a submission. """ @@ -205,7 +203,7 @@ def kill(self): self.calc_task_timing() -class NewBalsamMPIExecutor(MPIExecutor): +class NewBalsamExecutor(Executor): """Inherits from MPIExecutor and wraps the Balsam service. 
Via this Executor, Balsam Jobs can be submitted to Balsam sites, either local or on remote machines. @@ -465,7 +463,7 @@ def submit( "Balsam Job submitted with no active BatchJobs! Initialize a matching BatchJob." ) - task = BalsamTask(app, app_args, workdir, None, None, self.workerID) + task = NewBalsamTask(app, app_args, workdir, None, None, self.workerID) if dry_run: task.dry_run = True diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 3c22f365c..80a68589f 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -2,7 +2,7 @@ import numpy as np from libensemble import Ensemble -from libensemble.executors import NewBalsamMPIExecutor +from libensemble.executors import NewBalsamExecutor from balsam.api import ApplicationDefinition # Use Globus to transfer output forces.stat files back @@ -39,7 +39,7 @@ class RemoteForces(ApplicationDefinition): } -exctr = NewBalsamMPIExecutor() +exctr = NewBalsamExecutor() exctr.register_app(RemoteForces, app_name="forces") batch = exctr.submit_allocation( From 97b74b1d1dba03c79832faf7f26f1b671be4cd35 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Mar 2022 17:38:45 -0600 Subject: [PATCH 35/93] flake8 --- .../tests/scaling_tests/balsam_forces/forces_simf.py | 6 +----- .../scaling_tests/balsam_forces/run_libe_forces_balsam.py | 3 ++- libensemble/tests/scaling_tests/forces/run_libe_forces.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index a6534be9c..b8b78c3fd 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -6,7 +6,7 @@ def run_forces_balsam(H, persis_info, sim_specs, 
libE_info): import numpy as np from libensemble.executors.executor import Executor - from libensemble.message_numbers import WORKER_DONE, WORKER_KILL, TASK_FAILED + from libensemble.message_numbers import WORKER_DONE, TASK_FAILED def perturb(particles, seed, max_fraction): MAX_SEED = 32767 @@ -73,10 +73,6 @@ def read_last_line(filepath): workdir=workdir, ) - # Stat file to check for bad runs - statfile = "forces.stat" - line = None - poll_interval = 2 # secs print("Beginning to poll Task {}".format(task.name)) while not task.finished: diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 80a68589f..832668410 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -18,7 +18,8 @@ ) forces.persis_info.add_random_streams() -# + + class RemoteForces(ApplicationDefinition): site = "jln_theta" command_template = ( diff --git a/libensemble/tests/scaling_tests/forces/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/run_libe_forces.py index ad3cf699b..6fa64a2e8 100644 --- a/libensemble/tests/scaling_tests/forces/run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/run_libe_forces.py @@ -9,6 +9,7 @@ from libensemble.tools import parse_args, save_libE_output, add_unique_random_streams from libensemble import logger from forces_support import test_libe_stats, test_ensemble_dir, check_log_exception +from libensemble.executors.mpi_executor import MPIExecutor PERSIS_GEN = False @@ -36,7 +37,6 @@ subprocess.check_call(['./build_forces.sh']) -from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() exctr.register_app(full_path=sim_app, app_name='forces') From 86404ff1237d8205979f7c702c5964fa0bcca769 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 1 Mar 2022 17:48:23 -0600 Subject: [PATCH 36/93] fix css and conf.py? 
--- docs/_static/custom.css | 16 ---------------- docs/conf.py | 2 +- 2 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 docs/_static/custom.css diff --git a/docs/_static/custom.css b/docs/_static/custom.css deleted file mode 100644 index f2c020405..000000000 --- a/docs/_static/custom.css +++ /dev/null @@ -1,16 +0,0 @@ -.wy-nav-content { -max-width: 850px !important; -} - -.toggle .header { - display: block; - clear: both; -} - -.toggle .header:after { - content: " ▶"; -} - -.toggle .header.open:after { - content: " ▼"; -} diff --git a/docs/conf.py b/docs/conf.py index bc258e4c4..b5319c5d5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -198,7 +198,7 @@ def __getattr__(cls, name): # html_static_path = [] def setup(app): - app.add_css_file('custom.css') + app.add_css_file('my_theme.css') # Custom sidebar templates, must be a dictionary that maps document names # to template names. From 576d009372ccf9f45759102c763a56f54dd95c67 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 10:12:17 -0600 Subject: [PATCH 37/93] fix balsam import condition --- libensemble/executors/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index aab130004..83d94534c 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -1,10 +1,10 @@ from libensemble.executors.executor import Executor from libensemble.executors.mpi_executor import MPIExecutor -from libensemble.executors.new_balsam_executor import NewBalsamExecutor -import os -import sys -if 'BALSAM_DB_PATH' in os.environ and int(sys.version[2]) >= 6: +import pkg_resources + +if pkg_resources.get_distribution('balsam-flow'): from libensemble.executors.balsam_executor import BalsamMPIExecutor + from libensemble.executors.new_balsam_executor import NewBalsamExecutor __all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamExecutor'] From 
441c4a8d3bf951ea67d09edb54e97ae62a5f76f7 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 11:00:20 -0600 Subject: [PATCH 38/93] probably don't import both, especially during CI? --- libensemble/executors/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 83d94534c..57f675eb0 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -1,10 +1,17 @@ from libensemble.executors.executor import Executor from libensemble.executors.mpi_executor import MPIExecutor +import os import pkg_resources if pkg_resources.get_distribution('balsam-flow'): - from libensemble.executors.balsam_executor import BalsamMPIExecutor - from libensemble.executors.new_balsam_executor import NewBalsamExecutor + try: + if 'BALSAM_DB_PATH' in os.environ: + from libensemble.executors.balsam_executor import BalsamMPIExecutor + else: + from libensemble.executors.new_balsam_executor import NewBalsamExecutor + + except ModuleNotFoundError: # One version of Balsam installed, but not the other + pass __all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamExecutor'] From 40085c4fc2760934714f8fe54c99b238ea91ea8d Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 11:13:35 -0600 Subject: [PATCH 39/93] yet again rearrange pkg_resources logic --- libensemble/executors/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 57f675eb0..704d8db2a 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -4,14 +4,14 @@ import os import pkg_resources -if pkg_resources.get_distribution('balsam-flow'): - try: +try: + if pkg_resources.get_distribution('balsam-flow'): if 'BALSAM_DB_PATH' in os.environ: from libensemble.executors.balsam_executor import BalsamMPIExecutor else: from 
libensemble.executors.new_balsam_executor import NewBalsamExecutor - except ModuleNotFoundError: # One version of Balsam installed, but not the other - pass +except ModuleNotFoundError: # One version of Balsam installed, but not the other + pass __all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamExecutor'] From c8ec0591c179835931d10e4b0588e4a04c5dfe6e Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 11:31:13 -0600 Subject: [PATCH 40/93] catch DistributionNotFound --- libensemble/executors/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 704d8db2a..4ca9b0f5e 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -11,7 +11,7 @@ else: from libensemble.executors.new_balsam_executor import NewBalsamExecutor -except ModuleNotFoundError: # One version of Balsam installed, but not the other +except (ModuleNotFoundError, pkg_resources.DistributionNotFound): # One version of Balsam installed, but not the other pass __all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamExecutor'] From 7fa44dfd744d9443be122eb48fffa592838f7b64 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 2 Mar 2022 15:24:11 -0600 Subject: [PATCH 41/93] Simple black --- libensemble/executors/new_balsam_executor.py | 24 +++++--------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 11cb06b80..882729c53 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -64,9 +64,7 @@ def __init__( def _get_time_since_balsam_submit(self): """Return time since balsam task entered RUNNING state""" - event_query = EventLog.objects.filter( - job_id=self.process.id, to_state="RUNNING" - ) + event_query = EventLog.objects.filter(job_id=self.process.id, 
to_state="RUNNING") if not len(event_query): return 0 balsam_launch_datetime = event_query[0].timestamp @@ -110,10 +108,7 @@ def _set_complete(self, dry_run=False): elif balsam_state in STATES: # In my states self.state = balsam_state else: - logger.warning( - "Task finished, but in unrecognized " - "Balsam state {}".format(balsam_state) - ) + logger.warning("Task finished, but in unrecognized " "Balsam state {}".format(balsam_state)) self.state = "UNKNOWN" logger.info("Task {} ended with state {}".format(self.name, self.state)) @@ -219,9 +214,7 @@ def __init__(self, custom_info={}): """ if custom_info: - logger.warning( - "The Balsam executor does not support custom_info - ignoring" - ) + logger.warning("The Balsam executor does not support custom_info - ignoring") super().__init__(custom_info) @@ -459,9 +452,7 @@ def submit( ) if not len(self.allocations): - logger.warning( - "Balsam Job submitted with no active BatchJobs! Initialize a matching BatchJob." - ) + logger.warning("Balsam Job submitted with no active BatchJobs! Initialize a matching BatchJob.") task = NewBalsamTask(app, app_args, workdir, None, None, self.workerID) @@ -491,13 +482,10 @@ def submit( if not task.timer.timing: task.timer.start() - task.submit_time = ( - task.timer.tstart - ) # Time not date - may not need if using timer. + task.submit_time = task.timer.tstart # Time not date - may not need if using timer. logger.info( - "Submitted Balsam App to site {}: " - "nodes {} ppn {}".format(App.site, num_nodes, procs_per_node) + "Submitted Balsam App to site {}: " "nodes {} ppn {}".format(App.site, num_nodes, procs_per_node) ) # task.workdir = task.process.working_directory # Might not be set yet! 
From 635d9e71424fbec8f0e494e4335952007175772b Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 2 Mar 2022 15:25:11 -0600 Subject: [PATCH 42/93] Simple black --- .../tests/scaling_tests/balsam_forces/forces_simf.py | 8 ++------ .../scaling_tests/balsam_forces/run_libe_forces_balsam.py | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index b8b78c3fd..7d9651174 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -85,14 +85,10 @@ def read_last_line(filepath): print("Task {} exited with state {}.".format(task.name, task.state)) if TRANSFER_STATFILES: print("Waiting for Task {} statfile.".format(task.name)) - while file_dest not in [ - os.path.join(os.getcwd(), i) for i in os.listdir(".") - ]: + while file_dest not in [os.path.join(os.getcwd(), i) for i in os.listdir(".")]: time.sleep(1) if read_last_line(file_dest) == "kill": - print( - "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" - ) + print("Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)") calc_status = TASK_FAILED else: calc_status = WORKER_DONE diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 832668410..08bf8bc5c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -13,9 +13,7 @@ forces.from_yaml("balsam_forces.yaml") forces.gen_specs["user"].update({"lb": np.array([0]), "ub": np.array([32767])}) -forces.sim_specs["user"].update( - {"transfer": TRANSFER_STATFILES, "globus_endpoint": GLOBUS_ENDPOINT} -) +forces.sim_specs["user"].update({"transfer": 
TRANSFER_STATFILES, "globus_endpoint": GLOBUS_ENDPOINT}) forces.persis_info.add_random_streams() From 90ab921128eb028b93e712de106c7d614d187b70 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Wed, 2 Mar 2022 15:27:30 -0600 Subject: [PATCH 43/93] Spell --- libensemble/executors/new_balsam_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 882729c53..5085d126a 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -38,7 +38,7 @@ class NewBalsamTask(Task): The same attributes and query routines are implemented. Use ``task.process`` to refer to the matching Balsam Job initialized by the NewBalsamExecutor, - with every Balsam Job method invokable on it. Otherwise, libEnsemble task methods + with every Balsam Job method invocable on it. Otherwise, libEnsemble task methods like ``poll()`` can be used directly. """ From 219698520f1a5752e5481d978dc8e78ebca49b26 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 15:44:35 -0600 Subject: [PATCH 44/93] large refactorings of docs, refactor balsam-submission routine to define apps and initialize libensemble separately. 
Balsam App launches run_libe_forces_balsam.py --- docs/executor/balsam2_executor.rst | 14 +++++ docs/executor/balsam_executor.rst | 20 +----- docs/executor/ex_index.rst | 5 +- docs/executor/executor.rst | 5 +- docs/executor/mpi_executor.rst | 4 +- libensemble/executors/new_balsam_executor.py | 9 +-- .../balsam_forces/define_balsam_apps_run.py | 63 +++++++++++++++++++ .../balsam_forces/run_libe_forces_balsam.py | 43 +++++-------- 8 files changed, 103 insertions(+), 60 deletions(-) create mode 100644 docs/executor/balsam2_executor.rst create mode 100644 libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py diff --git a/docs/executor/balsam2_executor.rst b/docs/executor/balsam2_executor.rst new file mode 100644 index 000000000..16ca4a917 --- /dev/null +++ b/docs/executor/balsam2_executor.rst @@ -0,0 +1,14 @@ +Balsam 2 Executor - Remote apps +=============================== + +.. automodule:: new_balsam_executor + :no-undoc-members: + +.. autoclass:: NewBalsamExecutor + :show-inheritance: + :members: __init__, register_app, submit_allocation, revoke_allocation, submit + +.. autoclass:: NewBalsamTask + :show-inheritance: + :member-order: bysource + :members: poll, wait, kill diff --git a/docs/executor/balsam_executor.rst b/docs/executor/balsam_executor.rst index 8f1de6930..4e961ba21 100644 --- a/docs/executor/balsam_executor.rst +++ b/docs/executor/balsam_executor.rst @@ -1,23 +1,5 @@ -Balsam Executors -================ - -Balsam 2 Executor ------------------ - -.. automodule:: new_balsam_executor - :no-undoc-members: - -.. autoclass:: NewBalsamExecutor - :show-inheritance: - :members: __init__, register_app, submit_allocation, revoke_allocation, submit - -.. autoclass:: NewBalsamTask - :show-inheritance: - :member-order: bysource - :members: poll, wait, kill - Balsam 1 MPI Executor ---------------------- +===================== .. 
automodule:: balsam_executor :no-undoc-members: diff --git a/docs/executor/ex_index.rst b/docs/executor/ex_index.rst index 0ba710537..beae83d57 100644 --- a/docs/executor/ex_index.rst +++ b/docs/executor/ex_index.rst @@ -9,7 +9,10 @@ portable interface for running and managing user applications. .. toctree:: :maxdepth: 2 :titlesonly: - :caption: libEnsemble Executor: + :caption: libEnsemble Executors: overview executor + mpi_executor + balsam_executor + balsam2_executor diff --git a/docs/executor/executor.rst b/docs/executor/executor.rst index 2e3fd9dae..3f7b3de50 100644 --- a/docs/executor/executor.rst +++ b/docs/executor/executor.rst @@ -1,5 +1,5 @@ -Executor Modules -================ +Base Executor - Local apps +========================== .. automodule:: executor :no-undoc-members: @@ -14,6 +14,7 @@ See the Executor APIs for optional arguments. mpi_executor balsam_executor + balsam2_executor Executor Class --------------- diff --git a/docs/executor/mpi_executor.rst b/docs/executor/mpi_executor.rst index bf95c9884..60fc1cc78 100644 --- a/docs/executor/mpi_executor.rst +++ b/docs/executor/mpi_executor.rst @@ -1,5 +1,5 @@ -MPI Executor -============ +MPI Executor - MPI apps +======================= .. automodule:: mpi_executor :no-undoc-members: diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 11cb06b80..9d128f9c8 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -211,19 +211,14 @@ class NewBalsamExecutor(Executor): """ - def __init__(self, custom_info={}): + def __init__(self): """Instantiate a new BalsamMPIExecutor instance. 
A new BalsamMPIExecutor object is created with an application registry and configuration attributes """ - if custom_info: - logger.warning( - "The Balsam executor does not support custom_info - ignoring" - ) - - super().__init__(custom_info) + super().__init__() self.workflow_name = "libe_workflow" self.allocations = [] diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py new file mode 100644 index 000000000..083e59c2f --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py @@ -0,0 +1,63 @@ +from balsam.api import ApplicationDefinition, BatchJob + + +class RemoteForces(ApplicationDefinition): + site = "jln_theta" + command_template = ( + "/home/jnavarro" + + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" + + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + + " > out.txt 2>&1" + ) + + transfers = { + "result": { + "required": False, + "direction": "out", + "local_path": "forces.stat", + "description": "Forces stat file", + "recursive": False, + } + } + +print("Defined RemoteForces Balsam ApplicationDefinition.") + +class LibensembleApp(ApplicationDefinition): + site = "jln_theta" + command_template = ( + "/home/jnavarro/.conda/envs/again/bin/python /home/jnavarro" + + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py" + + " > libe_out.txt 2>&1" + ) + + transfers = { + "input_file": { + "required": True, + "direction": "in", + "local_path": ".", + "description": "Transfer in of balsam_forces.yaml", + "recursive": False, + } + } + +print("Defined LibensembleApp Balsam ApplicationDefinition.") + +libe_job = LibensembleApp.submit( + workdir="libe_workflow/libe_processes", + transfers={ + "input_file": "theta_dtn:/home/jnavarro/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" + }, +) + +print("libEnsemble Job created.") + 
+BatchJob.objects.create( + site_id=libe_job.site_id, + num_nodes=4, + wall_time_min=60, + job_mode="mpi", + project="CSC250STMS07", + queue="debug-flat-quad", +) + +print("BatchJob session initialized.") diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 832668410..8eccfb76d 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -6,7 +6,8 @@ from balsam.api import ApplicationDefinition # Use Globus to transfer output forces.stat files back -TRANSFER_STATFILES = True +ON_THETA = True +TRANSFER_STATFILES = False GLOBUS_ENDPOINT = "jln_laptop" forces = Ensemble() @@ -19,38 +20,22 @@ forces.persis_info.add_random_streams() - -class RemoteForces(ApplicationDefinition): - site = "jln_theta" - command_template = ( - "/home/jnavarro" - + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" - + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" - + " > out.txt 2>&1" - ) - - transfers = { - "result": { - "required": False, - "direction": "out", - "local_path": "forces.stat", - "description": "Forces stat file", - "recursive": False, - } - } - +apps = ApplicationDefinition.load_by_site["jln_theta"] +RemoteForces = apps["RemoteForces"] exctr = NewBalsamExecutor() exctr.register_app(RemoteForces, app_name="forces") -batch = exctr.submit_allocation( - site_id=246, - num_nodes=4, - wall_time_min=30, - queue="debug-flat-quad", - project="CSC250STMS07", -) +if not ON_THETA: + batch = exctr.submit_allocation( + site_id=246, + num_nodes=4, + wall_time_min=30, + queue="debug-flat-quad", + project="CSC250STMS07", + ) forces.run() -exctr.revoke_allocation(batch) +if not ON_THETA: + exctr.revoke_allocation(batch) From e3404b753d49a1661b21780f9954cf45f1e65709 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 15:46:54 -0600 Subject: 
[PATCH 45/93] flake8 --- .../scaling_tests/balsam_forces/define_balsam_apps_run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py index 083e59c2f..5f45a1780 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py @@ -20,8 +20,10 @@ class RemoteForces(ApplicationDefinition): } } + print("Defined RemoteForces Balsam ApplicationDefinition.") + class LibensembleApp(ApplicationDefinition): site = "jln_theta" command_template = ( @@ -40,12 +42,15 @@ class LibensembleApp(ApplicationDefinition): } } + print("Defined LibensembleApp Balsam ApplicationDefinition.") +input_file = "theta_dtn:/home/jnavarro/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" + libe_job = LibensembleApp.submit( workdir="libe_workflow/libe_processes", transfers={ - "input_file": "theta_dtn:/home/jnavarro/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" + "input_file": input_file }, ) From a8a1d34026e92718fd3d7fdac5c478b7fe462049 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 15:59:16 -0600 Subject: [PATCH 46/93] fix load_by_site --- .../tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 2e85691fc..dbf4d2c97 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -18,7 +18,7 @@ forces.persis_info.add_random_streams() -apps = ApplicationDefinition.load_by_site["jln_theta"] +apps = ApplicationDefinition.load_by_site("jln_theta") 
RemoteForces = apps["RemoteForces"] exctr = NewBalsamExecutor() From 85a41471d7574d2dc8900d9158bfc0a414bec388 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 16:15:37 -0600 Subject: [PATCH 47/93] still need to specify MPI ranks for libensemble job, bump sim_max --- .../tests/scaling_tests/balsam_forces/balsam_forces.yaml | 2 +- .../scaling_tests/balsam_forces/define_balsam_apps_run.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index b3b825f0f..cc1db2d8d 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -2,7 +2,7 @@ libE_specs: save_every_k_gens: 1000 profile: False exit_criteria: - sim_max: 8 + sim_max: 16 sim_specs: function: libensemble.tests.scaling_tests.balsam_forces.forces_simf.run_forces_balsam diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py index 5f45a1780..9859714f5 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py @@ -49,16 +49,16 @@ class LibensembleApp(ApplicationDefinition): libe_job = LibensembleApp.submit( workdir="libe_workflow/libe_processes", - transfers={ - "input_file": input_file - }, + num_nodes=1, + ranks_per_node=5, + transfers={"input_file": input_file}, ) print("libEnsemble Job created.") BatchJob.objects.create( site_id=libe_job.site_id, - num_nodes=4, + num_nodes=5, wall_time_min=60, job_mode="mpi", project="CSC250STMS07", From 6cbac2c03fbaea54b3e43abbdbce92e3a032184c Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 2 Mar 2022 16:34:02 -0600 Subject: [PATCH 48/93] pass on OSError on attempt to sync Balsam app (probably no access to app source) --- 
libensemble/executors/new_balsam_executor.py | 10 ++++++---- .../balsam_forces/define_balsam_apps_run.py | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py index 3e59a88f4..97dfe09e7 100644 --- a/libensemble/executors/new_balsam_executor.py +++ b/libensemble/executors/new_balsam_executor.py @@ -448,9 +448,6 @@ def submit( "No procs/nodes provided - aborting", ) - if not len(self.allocations): - logger.warning("Balsam Job submitted with no active BatchJobs! Initialize a matching BatchJob.") - task = NewBalsamTask(app, app_args, workdir, None, None, self.workerID) if dry_run: @@ -459,7 +456,12 @@ def submit( task._set_complete(dry_run=True) else: App = app.pyobj - App.sync() + + try: + App.sync() # if App source-code available, send to Balsam service + except OSError: + pass # App retrieved from Balsam service, assume no access to source-code + task.process = Job( app_id=App, workdir=workdir, diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py index 9859714f5..a5d449f30 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py @@ -20,8 +20,9 @@ class RemoteForces(ApplicationDefinition): } } +RemoteForces.sync() -print("Defined RemoteForces Balsam ApplicationDefinition.") +print("Defined and synced RemoteForces Balsam ApplicationDefinition.") class LibensembleApp(ApplicationDefinition): @@ -54,7 +55,7 @@ class LibensembleApp(ApplicationDefinition): transfers={"input_file": input_file}, ) -print("libEnsemble Job created.") +print("libEnsemble Job created, synced with Balsam.") BatchJob.objects.create( site_id=libe_job.site_id, From b2277748957a1f6e130c26e96b7ceab18e05fcb0 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 3 Mar 2022 10:08:50 -0600 
Subject: [PATCH 49/93] fix globus destination directory --- .../balsam_forces/define_balsam_apps_run.py | 6 +++++- .../balsam_forces/forces_simf.py | 15 +++++++++----- .../balsam_forces/run_libe_forces_balsam.py | 20 ++++++++++++++----- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py index a5d449f30..02067d10e 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py @@ -20,6 +20,7 @@ class RemoteForces(ApplicationDefinition): } } + RemoteForces.sync() print("Defined and synced RemoteForces Balsam ApplicationDefinition.") @@ -46,7 +47,10 @@ class LibensembleApp(ApplicationDefinition): print("Defined LibensembleApp Balsam ApplicationDefinition.") -input_file = "theta_dtn:/home/jnavarro/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" +input_file = ( + "jln_laptop:/Users/jnavarro/Desktop/libensemble" + + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" +) libe_job = LibensembleApp.submit( workdir="libe_workflow/libe_processes", diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 7d9651174..94e264ba2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -35,7 +35,8 @@ def read_last_line(filepath): sim_particles = sim_specs["user"]["sim_particles"] sim_timesteps = sim_specs["user"]["sim_timesteps"] TRANSFER_STATFILES = sim_specs["user"]["transfer"] - globus_endpoint = sim_specs["user"]["globus_endpoint"] + GLOBUS_ENDPOINT = sim_specs["user"]["globus_endpoint"] + GLOBUS_DEST_DIR = sim_specs["user"]["globus_dest_dir"] # Get from dictionary if key exists, else return 
default (e.g. 0) kill_rate = sim_specs["user"].get("kill_rate", 0) @@ -56,9 +57,9 @@ def read_last_line(filepath): } workdir = "worker" + str(libE_info["workerID"]) + "_" + secrets.token_hex(nbytes=3) - file_dest = os.getcwd() + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" + file_dest = GLOBUS_DEST_DIR + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" if TRANSFER_STATFILES: - transfer = {"result": globus_endpoint + ":" + file_dest} + transfer = {"result": GLOBUS_ENDPOINT + ":" + file_dest} else: transfer = {} @@ -85,10 +86,14 @@ def read_last_line(filepath): print("Task {} exited with state {}.".format(task.name, task.state)) if TRANSFER_STATFILES: print("Waiting for Task {} statfile.".format(task.name)) - while file_dest not in [os.path.join(os.getcwd(), i) for i in os.listdir(".")]: + while file_dest not in [ + os.path.join(os.getcwd(), i) for i in os.listdir(".") + ]: time.sleep(1) if read_last_line(file_dest) == "kill": - print("Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)") + print( + "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" + ) calc_status = TASK_FAILED else: calc_status = WORKER_DONE diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index dbf4d2c97..c873a770a 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -6,15 +6,25 @@ from balsam.api import ApplicationDefinition # Use Globus to transfer output forces.stat files back -ON_THETA = True -TRANSFER_STATFILES = False +THIS_SCRIPT_ON_THETA = True +TRANSFER_STATFILES = True GLOBUS_ENDPOINT = "jln_laptop" +GLOBUS_DEST_DIR = ( + "/Users/jnavarro/Desktop/libensemble" + + "/libensemble/libensemble/tests/scaling_tests/balsam_forces" +) forces = Ensemble() 
forces.from_yaml("balsam_forces.yaml") forces.gen_specs["user"].update({"lb": np.array([0]), "ub": np.array([32767])}) -forces.sim_specs["user"].update({"transfer": TRANSFER_STATFILES, "globus_endpoint": GLOBUS_ENDPOINT}) +forces.sim_specs["user"].update( + { + "transfer": TRANSFER_STATFILES, + "globus_endpoint": GLOBUS_ENDPOINT, + "globus_dest_dir": GLOBUS_DEST_DIR, + } +) forces.persis_info.add_random_streams() @@ -24,7 +34,7 @@ exctr = NewBalsamExecutor() exctr.register_app(RemoteForces, app_name="forces") -if not ON_THETA: +if not THIS_SCRIPT_ON_THETA: batch = exctr.submit_allocation( site_id=246, num_nodes=4, @@ -35,5 +45,5 @@ forces.run() -if not ON_THETA: +if not THIS_SCRIPT_ON_THETA: exctr.revoke_allocation(batch) From d6e5511681bd4b068b2a77a720f9cd907f333a7e Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 3 Mar 2022 10:21:05 -0600 Subject: [PATCH 50/93] attempt to cancel BatchJob once all stat files returned --- .../balsam_forces/define_balsam_apps_run.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py index 02067d10e..ff96bd8a2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py @@ -1,5 +1,9 @@ +import time +import glob from balsam.api import ApplicationDefinition, BatchJob +SIM_MAX = 16 # make sure matches in balsam_forces.yaml + class RemoteForces(ApplicationDefinition): site = "jln_theta" @@ -61,7 +65,7 @@ class LibensembleApp(ApplicationDefinition): print("libEnsemble Job created, synced with Balsam.") -BatchJob.objects.create( +batch = BatchJob.objects.create( site_id=libe_job.site_id, num_nodes=5, wall_time_min=60, @@ -71,3 +75,14 @@ class LibensembleApp(ApplicationDefinition): ) print("BatchJob session initialized.") +print("Waiting for all returned forces.stat 
files...") + +while len(glob.glob("./*.stat")) != SIM_MAX: + time.sleep(3) + +print("All forces.stat files returned. Cancelling BatchJob session.") + +batch.state = "pending_deletion" +batch.save() + +print("BatchJob session cancelled. Success!") From e206ef8ea2a6f4a3df348c1006bbd19660485aac Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 3 Mar 2022 10:57:52 -0600 Subject: [PATCH 51/93] attempt to improve forces.stat eval logic, considering machine/if transfer occurring --- .../balsam_forces/forces_simf.py | 36 +++++++++++++------ .../balsam_forces/run_libe_forces_balsam.py | 1 + 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 94e264ba2..fe21c59a9 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -37,6 +37,7 @@ def read_last_line(filepath): TRANSFER_STATFILES = sim_specs["user"]["transfer"] GLOBUS_ENDPOINT = sim_specs["user"]["globus_endpoint"] GLOBUS_DEST_DIR = sim_specs["user"]["globus_dest_dir"] + THIS_SCRIPT_ON_THETA = sim_specs["user"]["this_script_on_theta"] # Get from dictionary if key exists, else return default (e.g. 
0) kill_rate = sim_specs["user"].get("kill_rate", 0) @@ -84,23 +85,36 @@ def read_last_line(filepath): if task.state in ["FINISHED", "FAILED"]: print("Task {} exited with state {}.".format(task.name, task.state)) - if TRANSFER_STATFILES: - print("Waiting for Task {} statfile.".format(task.name)) - while file_dest not in [ - os.path.join(os.getcwd(), i) for i in os.listdir(".") - ]: - time.sleep(1) - if read_last_line(file_dest) == "kill": + if THIS_SCRIPT_ON_THETA: + statfile = "../" + workdir + "/" + file_dest.split("/")[-1] + if read_last_line(statfile) == "kill": print( - "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" + "Warning: Task completed although marked as a bad run (kill flag set in forces.stat)" ) calc_status = TASK_FAILED else: calc_status = WORKER_DONE - print("Task completed successfully. forces.stat retrieved.") + print("Task completed successfully.") + else: - calc_status = WORKER_DONE - print("Task completed.") + if TRANSFER_STATFILES: + print("Waiting for Task {} statfile.".format(task.name)) + while file_dest not in [ + os.path.join(os.getcwd(), i) for i in os.listdir(".") + ]: + time.sleep(1) + + if read_last_line(file_dest) == "kill": + print( + "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" + ) + calc_status = TASK_FAILED + else: + calc_status = WORKER_DONE + print("Task completed successfully. 
forces.stat retrieved.") + else: + calc_status = WORKER_DONE + print("Task completed.") else: print(task.state) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index c873a770a..d67987d92 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -23,6 +23,7 @@ "transfer": TRANSFER_STATFILES, "globus_endpoint": GLOBUS_ENDPOINT, "globus_dest_dir": GLOBUS_DEST_DIR, + "this_script_on_theta": THIS_SCRIPT_ON_THETA, } ) From fba4c6c20d744673748b1bcfda541636449f7b1e Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 3 Mar 2022 11:06:27 -0600 Subject: [PATCH 52/93] fix np.read of statfile for each run dest --- .../scaling_tests/balsam_forces/forces_simf.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index fe21c59a9..ad9cd0cf1 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -96,6 +96,12 @@ def read_last_line(filepath): calc_status = WORKER_DONE print("Task completed successfully.") + try: + data = np.loadtxt(statfile) + final_energy = data[-1] + except Exception: + final_energy = np.nan + else: if TRANSFER_STATFILES: print("Waiting for Task {} statfile.".format(task.name)) @@ -112,18 +118,18 @@ def read_last_line(filepath): else: calc_status = WORKER_DONE print("Task completed successfully. 
forces.stat retrieved.") + + try: + data = np.loadtxt(file_dest) + final_energy = data[-1] + except Exception: + final_energy = np.nan else: calc_status = WORKER_DONE print("Task completed.") else: print(task.state) - time.sleep(0.2) - try: - data = np.loadtxt(file_dest) - final_energy = data[-1] - except Exception: - final_energy = np.nan outspecs = sim_specs["out"] output = np.zeros(1, dtype=outspecs) From 7de8549f22d7347289af58298ed26e2dfac16e38 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 3 Mar 2022 11:15:14 -0600 Subject: [PATCH 53/93] fix syntax --- libensemble/tests/scaling_tests/balsam_forces/forces_simf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index ad9cd0cf1..b62acc206 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -123,14 +123,13 @@ def read_last_line(filepath): data = np.loadtxt(file_dest) final_energy = data[-1] except Exception: - final_energy = np.nan + final_energy = np.nan else: calc_status = WORKER_DONE print("Task completed.") else: print(task.state) - outspecs = sim_specs["out"] output = np.zeros(1, dtype=outspecs) output["energy"][0] = final_energy From afc2cf15ef8d1577e0c360928fc234c438bbbfe4 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 12:15:28 -0600 Subject: [PATCH 54/93] rename submission script, add section on submitting libe as balsam app to readme --- .../scaling_tests/balsam_forces/readme.md | 38 ++++++ .../balsam_forces/run_libe_forces_balsam.py | 3 +- ...ps_run.py => submit_libe_forces_balsam.py} | 109 +++++++++++------- 3 files changed, 105 insertions(+), 45 deletions(-) rename libensemble/tests/scaling_tests/balsam_forces/{define_balsam_apps_run.py => submit_libe_forces_balsam.py} (57%) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md 
b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 13d2a0c9f..3564f508e 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -65,6 +65,7 @@ Application parameters can be adjusted in `funcx_forces.yaml`. Note that each function and path must be accessible and/or importable on the remote machine. Absolute paths are recommended. +**This runs libEnsemble itself in-place, with only forces submitted to a Balsam site.** To remove output before the next run, use: @@ -94,3 +95,40 @@ This should be sufficient for ``forces.stat`` files from remote Balsam app runs to be transferred back to your local launch directory after every app run. The simulation function will wait for Balsam to transfer back a stat file, then determine the calc status based on the received output. + +### (Optional) Running libEnsemble as a Balsam app on compute nodes + +The previous instructions for running libEnsemble are understandably insufficient +if running with potentially hundreds of workers or if the simulation/generation +functions are computationally expensive. + +The included ``submit_libe_forces_balsam.py`` script will submit libEnsemble itself +as a Balsam Job, to be run by a Balsam site on the compute nodes. From there libEnsemble's +simulation function will behave as before, submitting forces apps to Balsam for scheduling +on the same allocation. + +Since Balsam's API can initiate allocations for a given Balsam site remotely, +``submit_libe_forces_balsam.py`` behaves like a batch submission script except +it can be run from *anywhere* and still initiate a session on Theta. This does mean +that any input files still need to be transferred by Globus to be accessible by +libEnsemble running on the compute nodes. Customize the ``input_file`` dictionary +according to Balsam's Globus specifications to do this (see the previous section). 
+ +The following parameters can be adjusted at the top of this script: + + SIM_MAX = 16 # make sure matches in balsam_forces.yaml + BATCH_NUM_NODES = 5 + BATCH_WALL_CLOCK_TIME = 60 + PROJECT = "CSC250STMS07" + QUEUE = "debug-flat-quad" + + # libE Job Parameters - Will use above resources + LIBE_NODES = 1 + LIBE_RANKS = 5 + +**Adjust each of the literal sites, directories, paths and other attributes** +in each of the ``ApplicationDefinition`` instances. If transferring statfiles, +this script can wait for a number of statfiles equal to ``sim_max`` to be returned, +then cancel the remote BatchJob. For this script, set ``TRANSFER_STATFILES`` to ``True.`` +The calling script will also need to be updated to contain the correct Globus endpoint +and destination directory for the transfers. diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index d67987d92..b5c06e909 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -5,8 +5,9 @@ from libensemble.executors import NewBalsamExecutor from balsam.api import ApplicationDefinition +THIS_SCRIPT_ON_THETA = True # Is this running on a personal machine, or a compute node? 
+ # Use Globus to transfer output forces.stat files back -THIS_SCRIPT_ON_THETA = True TRANSFER_STATFILES = True GLOBUS_ENDPOINT = "jln_laptop" GLOBUS_DEST_DIR = ( diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py similarity index 57% rename from libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py rename to libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py index ff96bd8a2..15deef330 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_balsam_apps_run.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py @@ -2,32 +2,31 @@ import glob from balsam.api import ApplicationDefinition, BatchJob +# Batch Session Parameters SIM_MAX = 16 # make sure matches in balsam_forces.yaml +BATCH_NUM_NODES = 5 +BATCH_WALL_CLOCK_TIME = 60 +PROJECT = "CSC250STMS07" +QUEUE = "debug-flat-quad" + +# libE Job Parameters - Will use above resources +LIBE_NODES = 1 +LIBE_RANKS = 5 + +# Transfer forces.stat files back to this script's source directory? +# Adjust run_libe_forces_balsam.py as well!!!! +TRANSFER_STATFILES = True + +# Transfer this file to the libE Job's working directory. 
+# # globus_endpoint_key *specified in local balsam site's settings.yml* +# globus_endpoint_key:/path/to/file +input_file = ( + "jln_laptop:/Users/jnavarro/Desktop/libensemble" + + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" +) - -class RemoteForces(ApplicationDefinition): - site = "jln_theta" - command_template = ( - "/home/jnavarro" - + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" - + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" - + " > out.txt 2>&1" - ) - - transfers = { - "result": { - "required": False, - "direction": "out", - "local_path": "forces.stat", - "description": "Forces stat file", - "recursive": False, - } - } - - -RemoteForces.sync() - -print("Defined and synced RemoteForces Balsam ApplicationDefinition.") +# FOR EACH OF THE FOLLOWING APPS, make sure Balsam sites, home directories, +# pythons, and other paths are updated. class LibensembleApp(ApplicationDefinition): @@ -51,38 +50,60 @@ class LibensembleApp(ApplicationDefinition): print("Defined LibensembleApp Balsam ApplicationDefinition.") -input_file = ( - "jln_laptop:/Users/jnavarro/Desktop/libensemble" - + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" -) - libe_job = LibensembleApp.submit( workdir="libe_workflow/libe_processes", - num_nodes=1, - ranks_per_node=5, + num_nodes=LIBE_NODES, + ranks_per_node=LIBE_RANKS, transfers={"input_file": input_file}, ) -print("libEnsemble Job created, synced with Balsam.") +print("libEnsemble Job created, synced with Balsam. 
Will run on next BatchJob") + + +class RemoteForces(ApplicationDefinition): + site = "jln_theta" + command_template = ( + "/home/jnavarro" + + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" + + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + + " > out.txt 2>&1" + ) + + transfers = { + "result": { + "required": False, + "direction": "out", + "local_path": "forces.stat", + "description": "Forces stat file", + "recursive": False, + } + } + + +RemoteForces.sync() + +print("Defined and synced RemoteForces Balsam ApplicationDefinition.") batch = BatchJob.objects.create( site_id=libe_job.site_id, - num_nodes=5, - wall_time_min=60, + num_nodes=BATCH_NUM_NODES, + wall_time_min=BATCH_WALL_CLOCK_TIME, job_mode="mpi", - project="CSC250STMS07", - queue="debug-flat-quad", + project=PROJECT, + queue=QUEUE, ) -print("BatchJob session initialized.") -print("Waiting for all returned forces.stat files...") +print("BatchJob session initialized. All Balsam apps will run in this BatchJob.") + +if TRANSFER_STATFILES: + print("Waiting for all returned forces.stat files...") -while len(glob.glob("./*.stat")) != SIM_MAX: - time.sleep(3) + while len(glob.glob("./*.stat")) != SIM_MAX: + time.sleep(3) -print("All forces.stat files returned. Cancelling BatchJob session.") + print("All forces.stat files returned. Cancelling BatchJob session.") -batch.state = "pending_deletion" -batch.save() + batch.state = "pending_deletion" + batch.save() -print("BatchJob session cancelled. Success!") + print("BatchJob session cancelled. 
Success!") From 1a066b01f615eef743e6ae351192a6f4e4fb8c2a Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 14:16:54 -0600 Subject: [PATCH 55/93] old balsam executor is now legacy, new balsam executor is now just the balsam_executor --- ...am2_executor.rst => balsam_2_executor.rst} | 10 +- docs/executor/ex_index.rst | 4 +- docs/executor/executor.rst | 4 +- ...xecutor.rst => legacy_balsam_executor.rst} | 10 +- docs/executor/overview.rst | 10 +- docs/overview_usecases.rst | 2 +- docs/platforms/platforms_index.rst | 6 +- docs/running_libE.rst | 2 +- libensemble/executors/__init__.py | 6 +- libensemble/executors/balsam_executor.py | 492 ++++++++++++------ .../executors/legacy_balsam_executor.py | 338 ++++++++++++ libensemble/executors/new_balsam_executor.py | 492 ------------------ .../tests/balsam_tests/reset_balsam_tests.py | 4 +- .../script_test_balsam_hworld.py | 4 +- .../test_executor.py | 4 +- .../test_executor_manager_poll.py | 4 +- .../test_executor_multi.py | 4 +- libensemble/tests/unit_tests/test_executor.py | 8 +- 18 files changed, 700 insertions(+), 704 deletions(-) rename docs/executor/{balsam2_executor.rst => balsam_2_executor.rst} (55%) rename docs/executor/{balsam_executor.rst => legacy_balsam_executor.rst} (67%) create mode 100644 libensemble/executors/legacy_balsam_executor.py delete mode 100644 libensemble/executors/new_balsam_executor.py diff --git a/docs/executor/balsam2_executor.rst b/docs/executor/balsam_2_executor.rst similarity index 55% rename from docs/executor/balsam2_executor.rst rename to docs/executor/balsam_2_executor.rst index 16ca4a917..e4fb86eed 100644 --- a/docs/executor/balsam2_executor.rst +++ b/docs/executor/balsam_2_executor.rst @@ -1,14 +1,14 @@ -Balsam 2 Executor - Remote apps -=============================== +Balsam Executor - Remote apps +============================= -.. automodule:: new_balsam_executor +.. automodule:: balsam_executor :no-undoc-members: -.. autoclass:: NewBalsamExecutor +.. 
autoclass:: BalsamExecutor :show-inheritance: :members: __init__, register_app, submit_allocation, revoke_allocation, submit -.. autoclass:: NewBalsamTask +.. autoclass:: BalsamTask :show-inheritance: :member-order: bysource :members: poll, wait, kill diff --git a/docs/executor/ex_index.rst b/docs/executor/ex_index.rst index beae83d57..b1967804f 100644 --- a/docs/executor/ex_index.rst +++ b/docs/executor/ex_index.rst @@ -14,5 +14,5 @@ portable interface for running and managing user applications. overview executor mpi_executor - balsam_executor - balsam2_executor + legacy_balsam_executor + balsam_2_executor diff --git a/docs/executor/executor.rst b/docs/executor/executor.rst index 3f7b3de50..9ee52b728 100644 --- a/docs/executor/executor.rst +++ b/docs/executor/executor.rst @@ -13,8 +13,8 @@ See the Executor APIs for optional arguments. :caption: Alternative Executors: mpi_executor - balsam_executor - balsam2_executor + legacy_balsam_executor + balsam_2_executor Executor Class --------------- diff --git a/docs/executor/balsam_executor.rst b/docs/executor/legacy_balsam_executor.rst similarity index 67% rename from docs/executor/balsam_executor.rst rename to docs/executor/legacy_balsam_executor.rst index 4e961ba21..18bd11cc7 100644 --- a/docs/executor/balsam_executor.rst +++ b/docs/executor/legacy_balsam_executor.rst @@ -1,16 +1,16 @@ -Balsam 1 MPI Executor -===================== +Legacy Balsam MPI Executor +========================== -.. automodule:: balsam_executor +.. automodule:: legacy_balsam_executor :no-undoc-members: -.. autoclass:: BalsamMPIExecutor +.. autoclass:: LegacyBalsamMPIExecutor :show-inheritance: :inherited-members: :member-order: bysource :members: __init__, submit, poll, manager_poll, kill, set_kill_mode -.. autoclass:: BalsamTask +.. 
autoclass:: LegacyBalsamTask :show-inheritance: :member-order: bysource :members: workdir_exists, file_exists_in_workdir, read_file_in_workdir, stdout_exists, read_stdout diff --git a/docs/executor/overview.rst b/docs/executor/overview.rst index 5f591d2c6..a04c5f1cc 100644 --- a/docs/executor/overview.rst +++ b/docs/executor/overview.rst @@ -23,9 +23,9 @@ to an application instance instead of a callable. They feature the ``cancel()``, from the standard. The main ``Executor`` class is an abstract class, inherited by the ``MPIExecutor`` -for direct running of MPI applications, and the ``BalsamMPIExecutor`` -for submitting MPI run requests from a worker running on a compute node to a -Balsam service running on a launch node. This second approach is suitable for +for direct running of MPI applications, and the ``BalsamExecutor`` +for submitting MPI run requests from a worker running on a compute node to the +Balsam service. This second approach is suitable for systems that don't allow submitting MPI applications from compute nodes. Typically, users choose and parameterize their ``Executor`` objects in their @@ -46,8 +46,8 @@ In calling script:: USE_BALSAM = False if USE_BALSAM: - from libensemble.executors.balsam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor + exctr = LegacyBalsamMPIExecutor() else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() diff --git a/docs/overview_usecases.rst b/docs/overview_usecases.rst index 588fe718f..069434cb2 100644 --- a/docs/overview_usecases.rst +++ b/docs/overview_usecases.rst @@ -111,7 +111,7 @@ its capabilities. * **Executor**: The executor can be used within user functions to provide a simple, portable interface for running and managing user tasks (applications). - There are multiple executors including the ``MPIExecutor`` and ``BalsamMPIExecutor``. 
+ There are multiple executors including the ``MPIExecutor`` and ``LegacyBalsamMPIExecutor``. The base ``Executor`` class allows local sub-processing of serial tasks. * **Submit**: Enqueue or indicate that one or more jobs or tasks needs to be diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index 5230f039c..8288f540f 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -84,7 +84,7 @@ Systems with Launch/MOM nodes Some large systems have a 3-tier node setup. That is, they have a separate set of launch nodes (known as MOM nodes on Cray Systems). User batch jobs or interactive sessions run on a launch node. Most such systems supply a special MPI runner which has some application-level scheduling -capability (eg. aprun, jsrun). MPI applications can only be submitted from these nodes. Examples +capability (eg. ``aprun``, ``jsrun``). MPI applications can only be submitted from these nodes. Examples of these systems include: Summit, Sierra and Theta. There are two ways of running libEnsemble on these kind of systems. The first, and simplest, @@ -94,8 +94,8 @@ is inherently centralized. The entire node allocation is available for the worke tasks. To run libEnsemble on the compute nodes of these systems requires an alternative Executor, -such as :doc:`Balsam<../executor/balsam_executor>`, which runs on the -launch nodes and launches tasks submitted by workers. Running libEnsemble on the compute +such as :doc:`Balsam<../executor/balsam_2_executor>`, which maintains a separate service +and launches tasks submitted by workers. Running libEnsemble on the compute nodes is potentially more scalable and will better manage ``sim_f`` and ``gen_f`` functions that contain considerable computational work or I/O. 
diff --git a/docs/running_libE.rst b/docs/running_libE.rst index fc57a5c4c..6a9851467 100644 --- a/docs/running_libE.rst +++ b/docs/running_libE.rst @@ -41,7 +41,7 @@ Limitations of MPI mode If you are launching MPI applications from workers, then MPI is being nested. This is not supported with Open MPI. This can be overcome by using a proxy launcher -(see :doc:`Balsam`). This nesting does work, however, +(see :doc:`Balsam`). This nesting does work, however, with MPICH and its derivative MPI implementations. It is also unsuitable to use this mode when running on the **launch** nodes of three-tier diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 4ca9b0f5e..9a332c542 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -7,11 +7,11 @@ try: if pkg_resources.get_distribution('balsam-flow'): if 'BALSAM_DB_PATH' in os.environ: - from libensemble.executors.balsam_executor import BalsamMPIExecutor + from libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor else: - from libensemble.executors.new_balsam_executor import NewBalsamExecutor + from libensemble.executors.balsam_executor import BalsamExecutor except (ModuleNotFoundError, pkg_resources.DistributionNotFound): # One version of Balsam installed, but not the other pass -__all__ = ['BalsamMPIExecutor', 'Executor', 'MPIExecutor', 'NewBalsamExecutor'] +__all__ = ['LegacyBalsamMPIExecutor', 'Executor', 'MPIExecutor', 'BalsamExecutor'] diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index 6bfa6e2a1..1cb5cd9f5 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -1,22 +1,13 @@ """ -This module launches and controls the running of tasks with Balsam_ 0.5.0. Balsam -is especially useful when running libEnsemble on three-tier systems with intermediate -launch nodes. 
Typically on such systems, MPI processes are themselves unable -to submit further MPI tasks to the batch scheduler. Therefore when libEnsemble's -workers have been launched in a distributed fashion via MPI, they must communicate -with an intermediate service like Balsam running on the launch nodes. The Balsam -service then reserves compute resources and launches tasks from libEnsemble's workers -that are using the Balsam MPI Executor. +This module launches and controls the running of tasks with Balsam 2, and most +notably can submit tasks from any machine, to any machine running a Balsam site. In order to create a Balsam executor, the calling script should contain :: - exctr = BalsamMPIExecutor() + exctr = NewBalsamExecutor() -The Balsam executor inherits from the MPI executor. See the -:doc:`MPIExecutor` for shared API. Any differences are -shown below. - -.. _Balsam: https://balsam.readthedocs.io/en/master/ +One key difference to consider is that instead of registering paths to apps, +Balsam ApplicationDefinition instances must be registered instead. """ @@ -25,13 +16,17 @@ import time import datetime -from libensemble.resources import mpi_resources -from libensemble.executors.executor import \ - Application, Task, ExecutorException, TimeoutExpired, jassert, STATES -from libensemble.executors.mpi_executor import MPIExecutor +from libensemble.executors.executor import ( + Application, + Task, + ExecutorException, + TimeoutExpired, + jassert, + STATES, +) +from libensemble.executors import Executor -import balsam.launcher.dag as dag -from balsam.core import models +from balsam.api import Job, BatchJob, EventLog logger = logging.getLogger(__name__) # To change logging level for just this module @@ -39,15 +34,25 @@ class BalsamTask(Task): - """Wraps a Balsam Task from the Balsam service + """Wraps a Balsam Job from the Balsam service. - The same attributes and query routines are implemented. + The same attributes and query routines are implemented. 
Use ``task.process`` + to refer to the matching Balsam Job initialized by the NewBalsamExecutor, + with every Balsam Job method invocable on it. Otherwise, libEnsemble task methods + like ``poll()`` can be used directly. """ - def __init__(self, app=None, app_args=None, workdir=None, - stdout=None, stderr=None, workerid=None): - """Instantiate a new BalsamTask instance. + def __init__( + self, + app=None, + app_args=None, + workdir=None, + stdout=None, + stderr=None, + workerid=None, + ): + """Instantiate a new NewLegacyBalsamTask instance. A new BalsamTask object is created with an id, status and configuration attributes. This will normally be created by the @@ -56,23 +61,13 @@ def __init__(self, app=None, app_args=None, workdir=None, # May want to override workdir with Balsam value when it exists Task.__init__(self, app, app_args, workdir, stdout, stderr, workerid) - def read_file_in_workdir(self, filename): - return self.process.read_file_in_workdir(filename) - - def read_stdout(self): - return self.process.read_file_in_workdir(self.stdout) - - def read_stderr(self): - return self.process.read_file_in_workdir(self.stderr) - def _get_time_since_balsam_submit(self): """Return time since balsam task entered RUNNING state""" - # If wait_on_start then can could calculate runtime same a base executor - # but otherwise that will return time from task submission. Get from Balsam. 
- - # self.runtime = self.process.runtime_seconds # Only reports at end of run currently - balsam_launch_datetime = self.process.get_state_times().get('RUNNING', None) + event_query = EventLog.objects.filter(job_id=self.process.id, to_state="RUNNING") + if not len(event_query): + return 0 + balsam_launch_datetime = event_query[0].timestamp current_datetime = datetime.datetime.now() if balsam_launch_datetime: return (current_datetime - balsam_launch_datetime).total_seconds() @@ -97,28 +92,30 @@ def _set_complete(self, dry_run=False): self.finished = True if dry_run: self.success = True - self.state = 'FINISHED' + self.state = "FINISHED" else: balsam_state = self.process.state self.workdir = self.workdir or self.process.working_directory self.calc_task_timing() - self.success = (balsam_state == 'JOB_FINISHED') - if balsam_state == 'JOB_FINISHED': - self.state = 'FINISHED' - elif balsam_state == 'PARENT_KILLED': # Not currently used - self.state = 'USER_KILLED' + if balsam_state in [ + "RUN_DONE", + "POSTPROCESSED", + "STAGED_OUT", + "JOB_FINISHED", + ]: + self.success = True + self.state = "FINISHED" elif balsam_state in STATES: # In my states self.state = balsam_state else: - logger.warning("Task finished, but in unrecognized " - "Balsam state {}".format(balsam_state)) - self.state = 'UNKNOWN' + logger.warning("Task finished, but in unrecognized " "Balsam state {}".format(balsam_state)) + self.state = "UNKNOWN" - logger.info("Task {} ended with state {}". - format(self.name, self.state)) + logger.info("Task {} ended with state {}".format(self.name, self.state)) def poll(self): - """Polls and updates the status attributes of the supplied task""" + """Polls and updates the status attributes of the supplied task. 
Requests + Job information from Balsam service.""" if self.dry_run: return @@ -130,21 +127,30 @@ def poll(self): balsam_state = self.process.state self.runtime = self._get_time_since_balsam_submit() - if balsam_state in models.END_STATES: + if balsam_state in ["RUN_DONE", "POSTPROCESSED", "STAGED_OUT", "JOB_FINISHED"]: self._set_complete() - elif balsam_state in models.ACTIVE_STATES: - self.state = 'RUNNING' + elif balsam_state in ["RUNNING"]: + self.state = "RUNNING" self.workdir = self.workdir or self.process.working_directory - elif (balsam_state in models.PROCESSABLE_STATES or - balsam_state in models.RUNNABLE_STATES): - self.state = 'WAITING' + elif balsam_state in [ + "CREATED", + "AWAITING_PARENTS", + "READY", + "STAGED_IN", + "PREPROCESSED", + ]: + self.state = "WAITING" + + elif balsam_state in ["RUN_ERROR", "RUN_TIMEOUT", "FAILED"]: + self.state = "FAILED" else: raise ExecutorException( "Task state returned from Balsam is not in known list of " - "Balsam states. Task state is {}".format(balsam_state)) + "Balsam states. 
Task state is {}".format(balsam_state) + ) def wait(self, timeout=None): """Waits on completion of the task or raises TimeoutExpired exception @@ -154,7 +160,7 @@ def wait(self, timeout=None): Parameters ---------- - timeout: + timeout: int Time in seconds after which a TimeoutExpired exception is raised""" if self.dry_run: @@ -166,7 +172,12 @@ def wait(self, timeout=None): # Wait on the task start = time.time() self.process.refresh_from_db() - while self.process.state not in models.END_STATES: + while self.process.state not in [ + "RUN_DONE", + "POSTPROCESSED", + "STAGED_OUT", + "JOB_FINISHED", + ]: time.sleep(0.2) self.process.refresh_from_db() if timeout and time.time() - start > timeout: @@ -176,100 +187,241 @@ def wait(self, timeout=None): self.runtime = self._get_time_since_balsam_submit() self._set_complete() - def kill(self, wait_time=None): - """ Kills or cancels the supplied task """ + def kill(self): + """Cancels the supplied task. Killing is unsupported at this time.""" - dag.kill(self.process) - - # Could have Wait here and check with Balsam its killed - - # but not implemented yet. + self.process.delete() logger.info("Killing task {}".format(self.name)) - self.state = 'USER_KILLED' + self.state = "USER_KILLED" self.finished = True self.calc_task_timing() -class BalsamMPIExecutor(MPIExecutor): - """Inherits from MPIExecutor and wraps the Balsam task management service +class BalsamExecutor(Executor): + """Inherits from Executor and wraps the Balsam service. Via this Executor, + Balsam Jobs can be submitted to Balsam sites, either local or on remote machines. .. note:: Task kills are not configurable in the Balsam executor. """ - def __init__(self, custom_info={}): - """Instantiate a new BalsamMPIExecutor instance. 
- - A new BalsamMPIExecutor object is created with an application - registry and configuration attributes - """ - if custom_info: - logger.warning("The Balsam executor does not support custom_info - ignoring") + def __init__(self): + """Instantiate a new BalsamExecutor instance.""" - super().__init__(custom_info) + super().__init__() self.workflow_name = "libe_workflow" + self.allocations = [] def serial_setup(self): - """Balsam serial setup includes empyting database and adding applications""" - BalsamMPIExecutor.del_apps() - BalsamMPIExecutor.del_tasks() - - for app in self.apps.values(): - calc_name = app.gname - desc = app.desc - full_path = app.full_path - self.add_app(calc_name, full_path, desc) - - @staticmethod - def del_apps(): - """Deletes all Balsam apps in the libe_app namespace""" - AppDef = models.ApplicationDefinition - - # Some error handling on deletes.... is it internal - for app_type in [Application.prefix]: - deletion_objs = AppDef.objects.filter(name__contains=app_type) - if deletion_objs: - for del_app in deletion_objs.iterator(): - logger.debug("Deleting app {}".format(del_app.name)) - deletion_objs.delete() - - @staticmethod - def del_tasks(): - """Deletes all Balsam tasks """ - for app_type in [Task.prefix]: - deletion_objs = models.BalsamJob.objects.filter( - name__contains=app_type) - if deletion_objs: - for del_task in deletion_objs.iterator(): - logger.debug("Deleting task {}".format(del_task.name)) - deletion_objs.delete() - - @staticmethod - def add_app(name, exepath, desc): - """ Add application to Balsam database """ - AppDef = models.ApplicationDefinition - app = AppDef() - app.name = name - app.executable = exepath - app.description = desc - # app.default_preprocess = '' # optional - # app.default_postprocess = '' # optional - app.save() - logger.debug("Added App {}".format(app.name)) + """Balsam serial setup includes emptying database and adding applications""" + pass + + def add_app(self, name, site, exepath, desc): + """Sync 
application with balsam service""" + pass + + def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): + """Registers a Balsam ApplicationDefinition to libEnsemble. This class + instance *must* have a ``site`` and ``command_template`` specified. See + the Balsam docs for information on other optional fields. + + Parameters + ---------- + + BalsamApp: ApplicationDefinition object + A Balsam ApplicationDefinition instance. + + app_name: String, optional + Name to identify this application. + + calc_type: String, optional + Calculation type: Set this application as the default 'sim' + or 'gen' function. + + desc: String, optional + Description of this application + + """ + if not app_name: + app_name = BalsamApp.command_template.split(" ")[0] + self.apps[app_name] = Application(" ", app_name, calc_type, desc, BalsamApp) + + # Default sim/gen apps will be deprecated. Just use names. + if calc_type is not None: + jassert( + calc_type in self.default_apps, + "Unrecognized calculation type", + calc_type, + ) + self.default_apps[calc_type] = self.apps[app_name] + + def submit_allocation( + self, + site_id, + num_nodes, + wall_time_min, + job_mode="mpi", + queue="local", + project="local", + ): + """ + Submits a Balsam ``BatchJob`` machine allocation request to Balsam. + Corresponding Balsam applications with a matching site can be submitted to + this allocation. + + Parameters + ---------- + + site_id: int + The corresponding site_id for a Balsam site. Retrieve via ``balsam site ls`` + + num_nodes: int + The number of nodes to request from a machine with a running Balsam site + + wall_time_min: int + The number of walltime minutes to request for the BatchJob allocation + + job_mode: String, optional + Either "serial" or "mpi". Default: "mpi" + + queue: String, optional + Specifies the queue from which the BatchJob should request nodes. Default: "local" + + project: String, optional + Specifies the project that should be charged for the requested hours. 
Default: "local" + + Returns + ------- + + The corresponding ``BatchJob`` object. + """ + + allocation = BatchJob.objects.create( + site_id=site_id, + num_nodes=num_nodes, + wall_time_min=wall_time_min, + job_mode=job_mode, + queue=queue, + project=project, + ) + + self.allocations.append(allocation) + + logger.info( + "Submitted Batch allocation to site {}: " + "nodes {} queue {} project {}".format(site_id, num_nodes, queue, project) + ) + + return allocation + + def revoke_allocation(self, allocation): + """ + Terminates a Balsam BatchJob machine allocation remotely. Balsam apps should + no longer be submitted to this allocation. Best to run after libEnsemble + completes, or after this BatchJob is no longer needed. Helps save machine time. + + Parameters + ---------- + + allocation: BatchJob object + a BatchJob with a corresponding machine allocation that should be cancelled. + """ + allocation.refresh_from_db() + + while not allocation.scheduler_id: + time.sleep(1) + allocation.refresh_from_db() + + batchjob = BatchJob.objects.get(scheduler_id=allocation.scheduler_id) + batchjob.state = "pending_deletion" + batchjob.save() def set_resources(self, resources): self.resources = resources - def submit(self, calc_type=None, app_name=None, num_procs=None, - num_nodes=None, procs_per_node=None, machinefile=None, - app_args=None, stdout=None, stderr=None, stage_inout=None, - hyperthreads=False, dry_run=False, wait_on_start=False, - extra_args=''): - """Creates a new task, and either executes or schedules to execute - in the executor + def submit( + self, + calc_type=None, + app_name=None, + app_args=None, + num_procs=None, + num_nodes=None, + procs_per_node=None, + max_tasks_per_node=None, + machinefile=None, + gpus_per_rank=0, + transfers={}, + workdir="", + dry_run=False, + wait_on_start=False, + extra_args={}, + ): + """Initializes and submits a Balsam Job based on a registered ApplicationDefinition + and requested resource parameters. 
A corresponding libEnsemble Task object + is created and returned. + + calc_type: String, optional + The calculation type: 'sim' or 'gen' + Only used if app_name is not supplied. Uses default sim or gen application. + + app_name: String, optional + The application name. Must be supplied if calc_type is not. + + app_args: dict + A dictionary of options that correspond to fields to template in the + ApplicationDefinition's ``command_template`` field. + + num_procs: int, optional + The total number of MPI ranks on which to submit the task + + num_nodes: int, optional + The number of nodes on which to submit the task + + procs_per_node: int, optional + The processes per node for this task + + max_tasks_per_node: int + Instructs Balsam to schedule at most this many Jobs per node. + + machinefile: string, optional + Name of a machinefile for this task to use. Unused by Balsam + + gpus_per_rank: int + Number of GPUs to reserve for each MPI rank + + transfers: dict + A Job-specific Balsam transfers dictionary that corresponds with an + ApplicationDefinition ``transfers`` field. See the Balsam docs for + more information. + + workdir: String + Specifies as name for the Job's output directory within the Balsam site's + data directory. Default: libe_workflow + + dry_run: boolean, optional + Whether this is a dry_run - no task will be launched; instead + runline is printed to logger (at INFO level) + + wait_on_start: boolean, optional + Whether to block, and wait for task to be polled as RUNNING (or other + active/end state) before continuing + + extra_args: dict + Additional arguments to supply to MPI runner. 
+ + Returns + ------- + + task: obj: Task + The launched task object + + Note that since Balsam Jobs are often sent to entirely different machines + than where libEnsemble is running, that how libEnsemble's resource manager + has divided local resources among workers doesn't impact what resources + can be requested for a Balsam Job running on an entirely different machine. - The created task object is returned. """ if app_name is not None: @@ -279,59 +431,57 @@ def submit(self, calc_type=None, app_name=None, num_procs=None, else: raise ExecutorException("Either app_name or calc_type must be set") + if len(workdir): + workdir = os.path.join(self.workflow_name, workdir) + else: + workdir = self.workflow_name + # Specific to this class if machinefile is not None: logger.warning("machinefile arg ignored - not supported in Balsam") - jassert(num_procs or num_nodes or procs_per_node, - "No procs/nodes provided - aborting") - - num_procs, num_nodes, procs_per_node = \ - mpi_resources.task_partition(num_procs, num_nodes, procs_per_node) - - if stdout is not None or stderr is not None: - logger.warning("Balsam does not currently accept a stdout " - "or stderr name - ignoring") - stdout = None - stderr = None - - # Will be possible to override with arg when implemented - # (or can have option to let Balsam assign) - default_workdir = os.getcwd() - task = BalsamTask(app, app_args, default_workdir, - stdout, stderr, self.workerID) - - add_task_args = {'name': task.name, - 'workflow': self.workflow_name, - 'user_workdir': default_workdir, - 'application': app.gname, - 'args': task.app_args, - 'num_nodes': num_nodes, - 'procs_per_node': procs_per_node, - 'mpi_flags': extra_args} - - if stage_inout is not None: - # For now hardcode staging - for testing - add_task_args['stage_in_url'] = "local:" + stage_inout + "/*" - add_task_args['stage_out_url'] = "local:" + stage_inout - add_task_args['stage_out_files'] = "*.out" + jassert( + num_procs or num_nodes or procs_per_node, + "No 
procs/nodes provided - aborting", + ) + + task = BalsamTask(app, app_args, workdir, None, None, self.workerID) if dry_run: task.dry_run = True - logger.info('Test (No submit) Runline: {}'.format(' '.join(add_task_args))) + logger.info("Test (No submit) Balsam app {}".format(app_name)) task._set_complete(dry_run=True) else: - task.process = dag.add_job(**add_task_args) - - if (wait_on_start): + App = app.pyobj + + try: + App.sync() # if App source-code available, send to Balsam service + except OSError: + pass # App retrieved from Balsam service, assume no access to source-code + + task.process = Job( + app_id=App, + workdir=workdir, + parameters=app_args, + num_nodes=num_nodes, + ranks_per_node=procs_per_node, + launch_params=extra_args, + gpus_per_rank=gpus_per_rank, + node_packing_count=max_tasks_per_node, + transfers=transfers, + ) + + task.process.save() + + if wait_on_start: self._wait_on_start(task) if not task.timer.timing: task.timer.start() task.submit_time = task.timer.tstart # Time not date - may not need if using timer. - logger.info("Added task to Balsam database {}: " - "nodes {} ppn {}". - format(task.name, num_nodes, procs_per_node)) + logger.info( + "Submitted Balsam App to site {}: " "nodes {} ppn {}".format(App.site, num_nodes, procs_per_node) + ) # task.workdir = task.process.working_directory # Might not be set yet! self.list_of_tasks.append(task) diff --git a/libensemble/executors/legacy_balsam_executor.py b/libensemble/executors/legacy_balsam_executor.py new file mode 100644 index 000000000..cdfcf6271 --- /dev/null +++ b/libensemble/executors/legacy_balsam_executor.py @@ -0,0 +1,338 @@ +""" +This module launches and controls the running of tasks with Balsam_ versions up to 0.5.0. Balsam +is especially useful when running libEnsemble on three-tier systems with intermediate +launch nodes. Typically on such systems, MPI processes are themselves unable +to submit further MPI tasks to the batch scheduler. 
Therefore when libEnsemble's +workers have been launched in a distributed fashion via MPI, they must communicate +with an intermediate service like Balsam running on the launch nodes. The Balsam +service then reserves compute resources and launches tasks from libEnsemble's workers +that are using the Balsam MPI Executor. + +In order to create a Balsam executor, the calling script should contain :: + + exctr = LegacyBalsamMPIExecutor() + +The Balsam executor inherits from the MPI executor. See the +:doc:`MPIExecutor` for shared API. Any differences are +shown below. + +.. _Balsam: https://balsam.readthedocs.io/en/master/ + +""" + +import os +import logging +import time +import datetime + +from libensemble.resources import mpi_resources +from libensemble.executors.executor import \ + Application, Task, ExecutorException, TimeoutExpired, jassert, STATES +from libensemble.executors.mpi_executor import MPIExecutor + +import balsam.launcher.dag as dag +from balsam.core import models + +logger = logging.getLogger(__name__) +# To change logging level for just this module +# logger.setLevel(logging.DEBUG) + + +class LegacyBalsamTask(Task): + """Wraps a Balsam Task from the Balsam service + + The same attributes and query routines are implemented. + + """ + + def __init__(self, app=None, app_args=None, workdir=None, + stdout=None, stderr=None, workerid=None): + """Instantiate a new LegacyBalsamTask instance. + + A new LegacyBalsamTask object is created with an id, status and + configuration attributes. This will normally be created by the + executor on a submission. 
+ """ + # May want to override workdir with Balsam value when it exists + Task.__init__(self, app, app_args, workdir, stdout, stderr, workerid) + + def read_file_in_workdir(self, filename): + return self.process.read_file_in_workdir(filename) + + def read_stdout(self): + return self.process.read_file_in_workdir(self.stdout) + + def read_stderr(self): + return self.process.read_file_in_workdir(self.stderr) + + def _get_time_since_balsam_submit(self): + """Return time since balsam task entered RUNNING state""" + + # If wait_on_start then can could calculate runtime same a base executor + # but otherwise that will return time from task submission. Get from Balsam. + + # self.runtime = self.process.runtime_seconds # Only reports at end of run currently + balsam_launch_datetime = self.process.get_state_times().get('RUNNING', None) + current_datetime = datetime.datetime.now() + if balsam_launch_datetime: + return (current_datetime - balsam_launch_datetime).total_seconds() + else: + return 0 + + def calc_task_timing(self): + """Calculate timing information for this task""" + + # Get runtime from Balsam + self.runtime = self._get_time_since_balsam_submit() + + if self.submit_time is None: + logger.warning("Cannot calc task total_time - submit time not set") + return + + if self.total_time is None: + self.total_time = time.time() - self.submit_time + + def _set_complete(self, dry_run=False): + """Set task as complete""" + self.finished = True + if dry_run: + self.success = True + self.state = 'FINISHED' + else: + balsam_state = self.process.state + self.workdir = self.workdir or self.process.working_directory + self.calc_task_timing() + self.success = (balsam_state == 'JOB_FINISHED') + if balsam_state == 'JOB_FINISHED': + self.state = 'FINISHED' + elif balsam_state == 'PARENT_KILLED': # Not currently used + self.state = 'USER_KILLED' + elif balsam_state in STATES: # In my states + self.state = balsam_state + else: + logger.warning("Task finished, but in unrecognized " + 
"Balsam state {}".format(balsam_state)) + self.state = 'UNKNOWN' + + logger.info("Task {} ended with state {}". + format(self.name, self.state)) + + def poll(self): + """Polls and updates the status attributes of the supplied task""" + if self.dry_run: + return + + if not self._check_poll(): + return + + # Get current state of tasks from Balsam database + self.process.refresh_from_db() + balsam_state = self.process.state + self.runtime = self._get_time_since_balsam_submit() + + if balsam_state in models.END_STATES: + self._set_complete() + + elif balsam_state in models.ACTIVE_STATES: + self.state = 'RUNNING' + self.workdir = self.workdir or self.process.working_directory + + elif (balsam_state in models.PROCESSABLE_STATES or + balsam_state in models.RUNNABLE_STATES): + self.state = 'WAITING' + + else: + raise ExecutorException( + "Task state returned from Balsam is not in known list of " + "Balsam states. Task state is {}".format(balsam_state)) + + def wait(self, timeout=None): + """Waits on completion of the task or raises TimeoutExpired exception + + Status attributes of task are updated on completion. + + Parameters + ---------- + + timeout: + Time in seconds after which a TimeoutExpired exception is raised""" + + if self.dry_run: + return + + if not self._check_poll(): + return + + # Wait on the task + start = time.time() + self.process.refresh_from_db() + while self.process.state not in models.END_STATES: + time.sleep(0.2) + self.process.refresh_from_db() + if timeout and time.time() - start > timeout: + self.runtime = self._get_time_since_balsam_submit() + raise TimeoutExpired(self.name, timeout) + + self.runtime = self._get_time_since_balsam_submit() + self._set_complete() + + def kill(self, wait_time=None): + """ Kills or cancels the supplied task """ + + dag.kill(self.process) + + # Could have Wait here and check with Balsam its killed - + # but not implemented yet. 
+ + logger.info("Killing task {}".format(self.name)) + self.state = 'USER_KILLED' + self.finished = True + self.calc_task_timing() + + +class LegacyBalsamMPIExecutor(MPIExecutor): + """Inherits from MPIExecutor and wraps the Balsam task management service + + .. note:: Task kills are not configurable in the Balsam executor. + + """ + def __init__(self, custom_info={}): + """Instantiate a new LegacyBalsamMPIExecutor instance. + + A new LegacyBalsamMPIExecutor object is created with an application + registry and configuration attributes + """ + + if custom_info: + logger.warning("The Balsam executor does not support custom_info - ignoring") + + super().__init__(custom_info) + + self.workflow_name = "libe_workflow" + + def serial_setup(self): + """Balsam serial setup includes empyting database and adding applications""" + LegacyBalsamMPIExecutor.del_apps() + LegacyBalsamMPIExecutor.del_tasks() + + for app in self.apps.values(): + calc_name = app.gname + desc = app.desc + full_path = app.full_path + self.add_app(calc_name, full_path, desc) + + @staticmethod + def del_apps(): + """Deletes all Balsam apps in the libe_app namespace""" + AppDef = models.ApplicationDefinition + + # Some error handling on deletes.... 
is it internal + for app_type in [Application.prefix]: + deletion_objs = AppDef.objects.filter(name__contains=app_type) + if deletion_objs: + for del_app in deletion_objs.iterator(): + logger.debug("Deleting app {}".format(del_app.name)) + deletion_objs.delete() + + @staticmethod + def del_tasks(): + """Deletes all Balsam tasks """ + for app_type in [Task.prefix]: + deletion_objs = models.BalsamJob.objects.filter( + name__contains=app_type) + if deletion_objs: + for del_task in deletion_objs.iterator(): + logger.debug("Deleting task {}".format(del_task.name)) + deletion_objs.delete() + + @staticmethod + def add_app(name, exepath, desc): + """ Add application to Balsam database """ + AppDef = models.ApplicationDefinition + app = AppDef() + app.name = name + app.executable = exepath + app.description = desc + # app.default_preprocess = '' # optional + # app.default_postprocess = '' # optional + app.save() + logger.debug("Added App {}".format(app.name)) + + def set_resources(self, resources): + self.resources = resources + + def submit(self, calc_type=None, app_name=None, num_procs=None, + num_nodes=None, procs_per_node=None, machinefile=None, + app_args=None, stdout=None, stderr=None, stage_inout=None, + hyperthreads=False, dry_run=False, wait_on_start=False, + extra_args=''): + """Creates a new task, and either executes or schedules to execute + in the executor + + The created task object is returned. 
+ """ + + if app_name is not None: + app = self.get_app(app_name) + elif calc_type is not None: + app = self.default_app(calc_type) + else: + raise ExecutorException("Either app_name or calc_type must be set") + + # Specific to this class + if machinefile is not None: + logger.warning("machinefile arg ignored - not supported in Balsam") + jassert(num_procs or num_nodes or procs_per_node, + "No procs/nodes provided - aborting") + + num_procs, num_nodes, procs_per_node = \ + mpi_resources.task_partition(num_procs, num_nodes, procs_per_node) + + if stdout is not None or stderr is not None: + logger.warning("Balsam does not currently accept a stdout " + "or stderr name - ignoring") + stdout = None + stderr = None + + # Will be possible to override with arg when implemented + # (or can have option to let Balsam assign) + default_workdir = os.getcwd() + task = LegacyBalsamTask(app, app_args, default_workdir, + stdout, stderr, self.workerID) + + add_task_args = {'name': task.name, + 'workflow': self.workflow_name, + 'user_workdir': default_workdir, + 'application': app.gname, + 'args': task.app_args, + 'num_nodes': num_nodes, + 'procs_per_node': procs_per_node, + 'mpi_flags': extra_args} + + if stage_inout is not None: + # For now hardcode staging - for testing + add_task_args['stage_in_url'] = "local:" + stage_inout + "/*" + add_task_args['stage_out_url'] = "local:" + stage_inout + add_task_args['stage_out_files'] = "*.out" + + if dry_run: + task.dry_run = True + logger.info('Test (No submit) Runline: {}'.format(' '.join(add_task_args))) + task._set_complete(dry_run=True) + else: + task.process = dag.add_job(**add_task_args) + + if (wait_on_start): + self._wait_on_start(task) + + if not task.timer.timing: + task.timer.start() + task.submit_time = task.timer.tstart # Time not date - may not need if using timer. + + logger.info("Added task to Balsam database {}: " + "nodes {} ppn {}". 
+ format(task.name, num_nodes, procs_per_node)) + + # task.workdir = task.process.working_directory # Might not be set yet! + self.list_of_tasks.append(task) + return task diff --git a/libensemble/executors/new_balsam_executor.py b/libensemble/executors/new_balsam_executor.py deleted file mode 100644 index 97dfe09e7..000000000 --- a/libensemble/executors/new_balsam_executor.py +++ /dev/null @@ -1,492 +0,0 @@ -""" -This module launches and controls the running of tasks with Balsam 2, and most -notably can submit tasks from any machine, to any machine running a Balsam site. - -In order to create a Balsam executor, the calling script should contain :: - - exctr = NewBalsamExecutor() - -One key difference to consider is that instead of registering paths to apps, -Balsam ApplicationDefinition instances must be registered instead. - -""" - -import os -import logging -import time -import datetime - -from libensemble.executors.executor import ( - Application, - Task, - ExecutorException, - TimeoutExpired, - jassert, - STATES, -) -from libensemble.executors import Executor - -from balsam.api import Job, BatchJob, EventLog - -logger = logging.getLogger(__name__) -# To change logging level for just this module -# logger.setLevel(logging.DEBUG) - - -class NewBalsamTask(Task): - """Wraps a Balsam Job from the Balsam service. - - The same attributes and query routines are implemented. Use ``task.process`` - to refer to the matching Balsam Job initialized by the NewBalsamExecutor, - with every Balsam Job method invocable on it. Otherwise, libEnsemble task methods - like ``poll()`` can be used directly. - - """ - - def __init__( - self, - app=None, - app_args=None, - workdir=None, - stdout=None, - stderr=None, - workerid=None, - ): - """Instantiate a new NewBalsamTask instance. - - A new NewBalsamTask object is created with an id, status and - configuration attributes. This will normally be created by the - executor on a submission. 
- """ - # May want to override workdir with Balsam value when it exists - Task.__init__(self, app, app_args, workdir, stdout, stderr, workerid) - - def _get_time_since_balsam_submit(self): - """Return time since balsam task entered RUNNING state""" - - event_query = EventLog.objects.filter(job_id=self.process.id, to_state="RUNNING") - if not len(event_query): - return 0 - balsam_launch_datetime = event_query[0].timestamp - current_datetime = datetime.datetime.now() - if balsam_launch_datetime: - return (current_datetime - balsam_launch_datetime).total_seconds() - else: - return 0 - - def calc_task_timing(self): - """Calculate timing information for this task""" - - # Get runtime from Balsam - self.runtime = self._get_time_since_balsam_submit() - - if self.submit_time is None: - logger.warning("Cannot calc task total_time - submit time not set") - return - - if self.total_time is None: - self.total_time = time.time() - self.submit_time - - def _set_complete(self, dry_run=False): - """Set task as complete""" - self.finished = True - if dry_run: - self.success = True - self.state = "FINISHED" - else: - balsam_state = self.process.state - self.workdir = self.workdir or self.process.working_directory - self.calc_task_timing() - if balsam_state in [ - "RUN_DONE", - "POSTPROCESSED", - "STAGED_OUT", - "JOB_FINISHED", - ]: - self.success = True - self.state = "FINISHED" - elif balsam_state in STATES: # In my states - self.state = balsam_state - else: - logger.warning("Task finished, but in unrecognized " "Balsam state {}".format(balsam_state)) - self.state = "UNKNOWN" - - logger.info("Task {} ended with state {}".format(self.name, self.state)) - - def poll(self): - """Polls and updates the status attributes of the supplied task. 
Requests - Job information from Balsam service.""" - if self.dry_run: - return - - if not self._check_poll(): - return - - # Get current state of tasks from Balsam database - self.process.refresh_from_db() - balsam_state = self.process.state - self.runtime = self._get_time_since_balsam_submit() - - if balsam_state in ["RUN_DONE", "POSTPROCESSED", "STAGED_OUT", "JOB_FINISHED"]: - self._set_complete() - - elif balsam_state in ["RUNNING"]: - self.state = "RUNNING" - self.workdir = self.workdir or self.process.working_directory - - elif balsam_state in [ - "CREATED", - "AWAITING_PARENTS", - "READY", - "STAGED_IN", - "PREPROCESSED", - ]: - self.state = "WAITING" - - elif balsam_state in ["RUN_ERROR", "RUN_TIMEOUT", "FAILED"]: - self.state = "FAILED" - - else: - raise ExecutorException( - "Task state returned from Balsam is not in known list of " - "Balsam states. Task state is {}".format(balsam_state) - ) - - def wait(self, timeout=None): - """Waits on completion of the task or raises TimeoutExpired exception - - Status attributes of task are updated on completion. - - Parameters - ---------- - - timeout: int - Time in seconds after which a TimeoutExpired exception is raised""" - - if self.dry_run: - return - - if not self._check_poll(): - return - - # Wait on the task - start = time.time() - self.process.refresh_from_db() - while self.process.state not in [ - "RUN_DONE", - "POSTPROCESSED", - "STAGED_OUT", - "JOB_FINISHED", - ]: - time.sleep(0.2) - self.process.refresh_from_db() - if timeout and time.time() - start > timeout: - self.runtime = self._get_time_since_balsam_submit() - raise TimeoutExpired(self.name, timeout) - - self.runtime = self._get_time_since_balsam_submit() - self._set_complete() - - def kill(self): - """Cancels the supplied task. 
Killing is unsupported at this time.""" - - self.process.delete() - - logger.info("Killing task {}".format(self.name)) - self.state = "USER_KILLED" - self.finished = True - self.calc_task_timing() - - -class NewBalsamExecutor(Executor): - """Inherits from MPIExecutor and wraps the Balsam service. Via this Executor, - Balsam Jobs can be submitted to Balsam sites, either local or on remote machines. - - .. note:: Task kills are not configurable in the Balsam executor. - - """ - - def __init__(self): - """Instantiate a new BalsamMPIExecutor instance. - - A new BalsamMPIExecutor object is created with an application - registry and configuration attributes - """ - - super().__init__() - - self.workflow_name = "libe_workflow" - self.allocations = [] - - def serial_setup(self): - """Balsam serial setup includes emptying database and adding applications""" - pass - - def add_app(self, name, site, exepath, desc): - """Sync application with balsam service""" - pass - - def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): - """Registers a Balsam ApplicationDefinition to libEnsemble. This class - instance *must* have a ``site`` and ``command_template`` specified. See - the Balsam docs for information on other optional fields. - - Parameters - ---------- - - BalsamApp: ApplicationDefinition object - A Balsam ApplicationDefinition instance. - - app_name: String, optional - Name to identify this application. - - calc_type: String, optional - Calculation type: Set this application as the default 'sim' - or 'gen' function. - - desc: String, optional - Description of this application - - """ - if not app_name: - app_name = BalsamApp.command_template.split(" ")[0] - self.apps[app_name] = Application(" ", app_name, calc_type, desc, BalsamApp) - - # Default sim/gen apps will be deprecated. Just use names. 
- if calc_type is not None: - jassert( - calc_type in self.default_apps, - "Unrecognized calculation type", - calc_type, - ) - self.default_apps[calc_type] = self.apps[app_name] - - def submit_allocation( - self, - site_id, - num_nodes, - wall_time_min, - job_mode="mpi", - queue="local", - project="local", - ): - """ - Submits a Balsam ``BatchJob`` machine allocation request to Balsam. - Corresponding Balsam applications with a matching site can be submitted to - this allocation. - - Parameters - ---------- - - site_id: int - The corresponding site_id for a Balsam site. Retrieve via ``balsam site ls`` - - num_nodes: int - The number of nodes to request from a machine with a running Balsam site - - wall_time_min: int - The number of walltime minutes to request for the BatchJob allocation - - job_mode: String, optional - Either "serial" or "mpi". Default: "mpi" - - queue: String, optional - Specifies the queue from which the BatchJob should request nodes. Default: "local" - - project: String, optional - Specifies the project that should be charged for the requested hours. Default: "local" - - Returns - ------- - - The corresponding ``BatchJob`` object. - """ - - allocation = BatchJob.objects.create( - site_id=site_id, - num_nodes=num_nodes, - wall_time_min=wall_time_min, - job_mode=job_mode, - queue=queue, - project=project, - ) - - self.allocations.append(allocation) - - logger.info( - "Submitted Batch allocation to site {}: " - "nodes {} queue {} project {}".format(site_id, num_nodes, queue, project) - ) - - return allocation - - def revoke_allocation(self, allocation): - """ - Terminates a Balsam BatchJob machine allocation remotely. Balsam apps should - no longer be submitted to this allocation. Best to run after libEnsemble - completes, or after this BatchJob is no longer needed. Helps save machine time. - - Parameters - ---------- - - allocation: BatchJob object - a BatchJob with a corresponding machine allocation that should be cancelled. 
- """ - allocation.refresh_from_db() - - while not allocation.scheduler_id: - time.sleep(1) - allocation.refresh_from_db() - - batchjob = BatchJob.objects.get(scheduler_id=allocation.scheduler_id) - batchjob.state = "pending_deletion" - batchjob.save() - - def set_resources(self, resources): - self.resources = resources - - def submit( - self, - calc_type=None, - app_name=None, - app_args=None, - num_procs=None, - num_nodes=None, - procs_per_node=None, - max_tasks_per_node=None, - machinefile=None, - gpus_per_rank=0, - transfers={}, - workdir="", - dry_run=False, - wait_on_start=False, - extra_args={}, - ): - """Initializes and submits a Balsam Job based on a registered ApplicationDefinition - and requested resource parameters. A corresponding libEnsemble Task object - is created and returned. - - calc_type: String, optional - The calculation type: 'sim' or 'gen' - Only used if app_name is not supplied. Uses default sim or gen application. - - app_name: String, optional - The application name. Must be supplied if calc_type is not. - - app_args: dict - A dictionary of options that correspond to fields to template in the - ApplicationDefinition's ``command_template`` field. - - num_procs: int, optional - The total number of MPI ranks on which to submit the task - - num_nodes: int, optional - The number of nodes on which to submit the task - - procs_per_node: int, optional - The processes per node for this task - - max_tasks_per_node: int - Instructs Balsam to schedule at most this many Jobs per node. - - machinefile: string, optional - Name of a machinefile for this task to use. Unused by Balsam - - gpus_per_rank: int - Number of GPUs to reserve for each MPI rank - - transfers: dict - A Job-specific Balsam transfers dictionary that corresponds with an - ApplicationDefinition ``transfers`` field. See the Balsam docs for - more information. - - workdir: String - Specifies as name for the Job's output directory within the Balsam site's - data directory. 
Default: libe_workflow - - dry_run: boolean, optional - Whether this is a dry_run - no task will be launched; instead - runline is printed to logger (at INFO level) - - wait_on_start: boolean, optional - Whether to block, and wait for task to be polled as RUNNING (or other - active/end state) before continuing - - extra_args: dict - Additional arguments to supply to MPI runner. - - Returns - ------- - - task: obj: Task - The launched task object - - Note that since Balsam Jobs are often sent to entirely different machines - than where libEnsemble is running, that how libEnsemble's resource manager - has divided local resources among workers doesn't impact what resources - can be requested for a Balsam Job running on an entirely different machine. - - """ - - if app_name is not None: - app = self.get_app(app_name) - elif calc_type is not None: - app = self.default_app(calc_type) - else: - raise ExecutorException("Either app_name or calc_type must be set") - - if len(workdir): - workdir = os.path.join(self.workflow_name, workdir) - else: - workdir = self.workflow_name - - # Specific to this class - if machinefile is not None: - logger.warning("machinefile arg ignored - not supported in Balsam") - jassert( - num_procs or num_nodes or procs_per_node, - "No procs/nodes provided - aborting", - ) - - task = NewBalsamTask(app, app_args, workdir, None, None, self.workerID) - - if dry_run: - task.dry_run = True - logger.info("Test (No submit) Balsam app {}".format(app_name)) - task._set_complete(dry_run=True) - else: - App = app.pyobj - - try: - App.sync() # if App source-code available, send to Balsam service - except OSError: - pass # App retrieved from Balsam service, assume no access to source-code - - task.process = Job( - app_id=App, - workdir=workdir, - parameters=app_args, - num_nodes=num_nodes, - ranks_per_node=procs_per_node, - launch_params=extra_args, - gpus_per_rank=gpus_per_rank, - node_packing_count=max_tasks_per_node, - transfers=transfers, - ) - - 
task.process.save() - - if wait_on_start: - self._wait_on_start(task) - - if not task.timer.timing: - task.timer.start() - task.submit_time = task.timer.tstart # Time not date - may not need if using timer. - - logger.info( - "Submitted Balsam App to site {}: " "nodes {} ppn {}".format(App.site, num_nodes, procs_per_node) - ) - - # task.workdir = task.process.working_directory # Might not be set yet! - self.list_of_tasks.append(task) - return task diff --git a/libensemble/tests/balsam_tests/reset_balsam_tests.py b/libensemble/tests/balsam_tests/reset_balsam_tests.py index 80e34fc29..01965fc15 100755 --- a/libensemble/tests/balsam_tests/reset_balsam_tests.py +++ b/libensemble/tests/balsam_tests/reset_balsam_tests.py @@ -2,8 +2,8 @@ import balsam.launcher.dag as dag -dag.BalsamTask.objects.filter(name__contains='outfile').delete() +dag.LegacyBalsamTask.objects.filter(name__contains='outfile').delete() -for job in dag.BalsamTask.objects.filter(name__contains='job_test_balsam'): +for job in dag.LegacyBalsamTask.objects.filter(name__contains='job_test_balsam'): job.update_state('CREATED') job.save() diff --git a/libensemble/tests/regression_tests/script_test_balsam_hworld.py b/libensemble/tests/regression_tests/script_test_balsam_hworld.py index 39ccf36f9..0b5b9488f 100644 --- a/libensemble/tests/regression_tests/script_test_balsam_hworld.py +++ b/libensemble/tests/regression_tests/script_test_balsam_hworld.py @@ -7,7 +7,7 @@ import mpi4py from mpi4py import MPI -from libensemble.executors.balsam_executor import BalsamMPIExecutor +from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor from libensemble.message_numbers import WORKER_DONE, WORKER_KILL_ON_ERR, WORKER_KILL_ON_TIMEOUT, TASK_FAILED from libensemble.libE import libE from libensemble.sim_funcs.executor_hworld import executor_hworld @@ -32,7 +32,7 @@ sim_app = './my_simtask.x' sim_app2 = six_hump_camel.__file__ -exctr = BalsamMPIExecutor() +exctr = LegacyBalsamMPIExecutor() 
exctr.register_app(full_path=sim_app, calc_type='sim') # Default 'sim' app - backward compatible exctr.register_app(full_path=sim_app2, app_name='six_hump_camel') # Named app exctr.register_app(full_path=sim_app2, app_name='sim_hump_camel_dry_run') diff --git a/libensemble/tests/standalone_executor_tests/test_executor.py b/libensemble/tests/standalone_executor_tests/test_executor.py index d8056c386..ac232d6ee 100644 --- a/libensemble/tests/standalone_executor_tests/test_executor.py +++ b/libensemble/tests/standalone_executor_tests/test_executor.py @@ -31,8 +31,8 @@ def build_simfunc(): # Create and add exes to registry if USE_BALSAM: - from libensemble.executors.balsam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor + exctr = LegacyBalsamMPIExecutor() else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() diff --git a/libensemble/tests/standalone_executor_tests/test_executor_manager_poll.py b/libensemble/tests/standalone_executor_tests/test_executor_manager_poll.py index 4a90b4c1f..65e9d6e24 100644 --- a/libensemble/tests/standalone_executor_tests/test_executor_manager_poll.py +++ b/libensemble/tests/standalone_executor_tests/test_executor_manager_poll.py @@ -40,8 +40,8 @@ def build_simfunc(): # Create and add exes to registry if USE_BALSAM: - from libensemble.executors.balsam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor + exctr = LegacyBalsamMPIExecutor() else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() diff --git a/libensemble/tests/standalone_executor_tests/test_executor_multi.py b/libensemble/tests/standalone_executor_tests/test_executor_multi.py index db1243281..06ff19de0 100644 --- a/libensemble/tests/standalone_executor_tests/test_executor_multi.py +++ 
b/libensemble/tests/standalone_executor_tests/test_executor_multi.py @@ -34,8 +34,8 @@ def build_simfunc(): # Create and add exes to registry if USE_BALSAM: - from libensemble.baslam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.baslam_executor import LegacyBalsamMPIExecutor + exctr = LegacyBalsamMPIExecutor() else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() diff --git a/libensemble/tests/unit_tests/test_executor.py b/libensemble/tests/unit_tests/test_executor.py index 751394508..beb7f2aea 100644 --- a/libensemble/tests/unit_tests/test_executor.py +++ b/libensemble/tests/unit_tests/test_executor.py @@ -67,8 +67,8 @@ def build_simfuncs(): def setup_executor(): """Set up an MPI Executor with sim app""" if USE_BALSAM: - from libensemble.executors.balsam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor + exctr = LegacyBalsamMPIExecutor() else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() @@ -93,8 +93,8 @@ def setup_executor_startups(): def setup_executor_noapp(): """Set up an MPI Executor but do not register application""" if USE_BALSAM: - from libensemble.executors.balsam_executor import BalsamMPIExecutor - exctr = BalsamMPIExecutor() + from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor + exctr = LegacyBalsamMPIExecutor() else: from libensemble.executors.mpi_executor import MPIExecutor exctr = MPIExecutor() From 7bf0123ca5a0158fc79cf2ceb48ad25223d8effe Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 14:20:20 -0600 Subject: [PATCH 56/93] deprecating old_balsam_tests and standalone executor tests? 
--- .../balsam_tests/bash_scripts/setup_balsam_tests.sh | 0 .../tests/{ => deprecated_tests}/balsam_tests/env_setup_theta.sh | 0 .../{ => deprecated_tests}/balsam_tests/readme.balsam_tests.txt | 0 libensemble/tests/{ => deprecated_tests}/balsam_tests/readme.rst | 0 .../{ => deprecated_tests}/balsam_tests/reset_balsam_tests.py | 0 .../{ => deprecated_tests}/balsam_tests/setup_balsam_tests.py | 0 .../{ => deprecated_tests}/balsam_tests/test_balsam_1__runjobs.py | 0 .../balsam_tests/test_balsam_2__workerkill.py | 0 .../balsam_tests/test_balsam_3__managerkill.py | 0 .../standalone_executor_tests/create_balsam_job.py | 0 .../{ => deprecated_tests}/standalone_executor_tests/readme.txt | 0 .../standalone_executor_tests/set.balsam.database.sh | 0 .../standalone_executor_tests/simdir/my_simtask.c | 0 .../standalone_executor_tests/simdir/my_simtask.f90 | 0 .../standalone_executor_tests/test_executor.py | 0 .../standalone_executor_tests/test_executor_manager_poll.py | 0 .../standalone_executor_tests/test_executor_multi.py | 0 17 files changed, 0 insertions(+), 0 deletions(-) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/bash_scripts/setup_balsam_tests.sh (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/env_setup_theta.sh (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/readme.balsam_tests.txt (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/readme.rst (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/reset_balsam_tests.py (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/setup_balsam_tests.py (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/test_balsam_1__runjobs.py (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/test_balsam_2__workerkill.py (100%) rename libensemble/tests/{ => deprecated_tests}/balsam_tests/test_balsam_3__managerkill.py (100%) rename libensemble/tests/{ => 
deprecated_tests}/standalone_executor_tests/create_balsam_job.py (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/readme.txt (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/set.balsam.database.sh (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/simdir/my_simtask.c (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/simdir/my_simtask.f90 (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/test_executor.py (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/test_executor_manager_poll.py (100%) rename libensemble/tests/{ => deprecated_tests}/standalone_executor_tests/test_executor_multi.py (100%) diff --git a/libensemble/tests/balsam_tests/bash_scripts/setup_balsam_tests.sh b/libensemble/tests/deprecated_tests/balsam_tests/bash_scripts/setup_balsam_tests.sh similarity index 100% rename from libensemble/tests/balsam_tests/bash_scripts/setup_balsam_tests.sh rename to libensemble/tests/deprecated_tests/balsam_tests/bash_scripts/setup_balsam_tests.sh diff --git a/libensemble/tests/balsam_tests/env_setup_theta.sh b/libensemble/tests/deprecated_tests/balsam_tests/env_setup_theta.sh similarity index 100% rename from libensemble/tests/balsam_tests/env_setup_theta.sh rename to libensemble/tests/deprecated_tests/balsam_tests/env_setup_theta.sh diff --git a/libensemble/tests/balsam_tests/readme.balsam_tests.txt b/libensemble/tests/deprecated_tests/balsam_tests/readme.balsam_tests.txt similarity index 100% rename from libensemble/tests/balsam_tests/readme.balsam_tests.txt rename to libensemble/tests/deprecated_tests/balsam_tests/readme.balsam_tests.txt diff --git a/libensemble/tests/balsam_tests/readme.rst b/libensemble/tests/deprecated_tests/balsam_tests/readme.rst similarity index 100% rename from libensemble/tests/balsam_tests/readme.rst rename to 
libensemble/tests/deprecated_tests/balsam_tests/readme.rst diff --git a/libensemble/tests/balsam_tests/reset_balsam_tests.py b/libensemble/tests/deprecated_tests/balsam_tests/reset_balsam_tests.py similarity index 100% rename from libensemble/tests/balsam_tests/reset_balsam_tests.py rename to libensemble/tests/deprecated_tests/balsam_tests/reset_balsam_tests.py diff --git a/libensemble/tests/balsam_tests/setup_balsam_tests.py b/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py similarity index 100% rename from libensemble/tests/balsam_tests/setup_balsam_tests.py rename to libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py diff --git a/libensemble/tests/balsam_tests/test_balsam_1__runjobs.py b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_1__runjobs.py similarity index 100% rename from libensemble/tests/balsam_tests/test_balsam_1__runjobs.py rename to libensemble/tests/deprecated_tests/balsam_tests/test_balsam_1__runjobs.py diff --git a/libensemble/tests/balsam_tests/test_balsam_2__workerkill.py b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_2__workerkill.py similarity index 100% rename from libensemble/tests/balsam_tests/test_balsam_2__workerkill.py rename to libensemble/tests/deprecated_tests/balsam_tests/test_balsam_2__workerkill.py diff --git a/libensemble/tests/balsam_tests/test_balsam_3__managerkill.py b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_3__managerkill.py similarity index 100% rename from libensemble/tests/balsam_tests/test_balsam_3__managerkill.py rename to libensemble/tests/deprecated_tests/balsam_tests/test_balsam_3__managerkill.py diff --git a/libensemble/tests/standalone_executor_tests/create_balsam_job.py b/libensemble/tests/deprecated_tests/standalone_executor_tests/create_balsam_job.py similarity index 100% rename from libensemble/tests/standalone_executor_tests/create_balsam_job.py rename to 
libensemble/tests/deprecated_tests/standalone_executor_tests/create_balsam_job.py diff --git a/libensemble/tests/standalone_executor_tests/readme.txt b/libensemble/tests/deprecated_tests/standalone_executor_tests/readme.txt similarity index 100% rename from libensemble/tests/standalone_executor_tests/readme.txt rename to libensemble/tests/deprecated_tests/standalone_executor_tests/readme.txt diff --git a/libensemble/tests/standalone_executor_tests/set.balsam.database.sh b/libensemble/tests/deprecated_tests/standalone_executor_tests/set.balsam.database.sh similarity index 100% rename from libensemble/tests/standalone_executor_tests/set.balsam.database.sh rename to libensemble/tests/deprecated_tests/standalone_executor_tests/set.balsam.database.sh diff --git a/libensemble/tests/standalone_executor_tests/simdir/my_simtask.c b/libensemble/tests/deprecated_tests/standalone_executor_tests/simdir/my_simtask.c similarity index 100% rename from libensemble/tests/standalone_executor_tests/simdir/my_simtask.c rename to libensemble/tests/deprecated_tests/standalone_executor_tests/simdir/my_simtask.c diff --git a/libensemble/tests/standalone_executor_tests/simdir/my_simtask.f90 b/libensemble/tests/deprecated_tests/standalone_executor_tests/simdir/my_simtask.f90 similarity index 100% rename from libensemble/tests/standalone_executor_tests/simdir/my_simtask.f90 rename to libensemble/tests/deprecated_tests/standalone_executor_tests/simdir/my_simtask.f90 diff --git a/libensemble/tests/standalone_executor_tests/test_executor.py b/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor.py similarity index 100% rename from libensemble/tests/standalone_executor_tests/test_executor.py rename to libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor.py diff --git a/libensemble/tests/standalone_executor_tests/test_executor_manager_poll.py b/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_manager_poll.py similarity index 
100% rename from libensemble/tests/standalone_executor_tests/test_executor_manager_poll.py rename to libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_manager_poll.py diff --git a/libensemble/tests/standalone_executor_tests/test_executor_multi.py b/libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_multi.py similarity index 100% rename from libensemble/tests/standalone_executor_tests/test_executor_multi.py rename to libensemble/tests/deprecated_tests/standalone_executor_tests/test_executor_multi.py From 230a78d46914401cc40b065993b6b56a46dc6a63 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 14:47:58 -0600 Subject: [PATCH 57/93] additional documentation and missed renames --- libensemble/executors/balsam_executor.py | 45 ++++++++++++++++--- .../balsam_forces/run_libe_forces_balsam.py | 4 +- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index 1cb5cd9f5..f034a269f 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -1,13 +1,46 @@ """ -This module launches and controls the running of tasks with Balsam 2, and most -notably can submit tasks from any machine, to any machine running a Balsam site. +This module launches and controls the running of tasks with Balsam_, and most +notably can submit tasks from any machine, to any machine running a Balsam site_. -In order to create a Balsam executor, the calling script should contain :: +In order to initiate a Balsam executor, the calling script should contain :: - exctr = NewBalsamExecutor() + from libensemble.executors import BalsamExecutor + exctr = BalsamExecutor() -One key difference to consider is that instead of registering paths to apps, -Balsam ApplicationDefinition instances must be registered instead. 
+One key difference to considser between this executor and libEnsemble's others is +that instead of registering paths to apps, Balsam ``ApplicationDefinition`` instances +must be registered instead. Furthermore, task submissions will not run until +Balsam reserves compute resources via a ``BatchJob``. This process may resemble: + + from libensemble.executors import BalsamExecutor + from balsam.api import ApplicationDefinition, BatchJob + + class HelloApp(ApplicationDefinition): + site = "my-balsam-site" + command_template = "/path/to/hello.app" + + exctr = BalsamExecutor() + exctr.register_app(HelloApp, app_name="hello") + + exctr.submit_allocation( + site_id=999, + num_nodes=4, + wall_time_min=30, + queue="debug-queue", + project="my-project", + ) + +Instances of the ``HelloApp`` application submitted by the executor within a user +function to the Balsam service will get scheduled within the reserved resource allocation. +Results, including output files, will appear in the Balsam site's ``data`` directory, +but can be `transferred back`_ via Globus_. + +*Reading Balsam's documentation is highly recommended.* + +.. _site: https://balsam.readthedocs.io/en/latest/user-guide/site-config/ +.. _Balsam: https://balsam.readthedocs.io/en/latest/ +.. _`transferred back`: https://balsam.readthedocs.io/en/latest/user-guide/transfer/ +.. 
_Globus: https://www.globus.org/ """ diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index b5c06e909..8a999591d 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -2,7 +2,7 @@ import numpy as np from libensemble import Ensemble -from libensemble.executors import NewBalsamExecutor +from libensemble.executors import BalsamExecutor from balsam.api import ApplicationDefinition THIS_SCRIPT_ON_THETA = True # Is this running on a personal machine, or a compute node? @@ -33,7 +33,7 @@ apps = ApplicationDefinition.load_by_site("jln_theta") RemoteForces = apps["RemoteForces"] -exctr = NewBalsamExecutor() +exctr = BalsamExecutor() exctr.register_app(RemoteForces, app_name="forces") if not THIS_SCRIPT_ON_THETA: From 841b3ade1fd749d606757ddcada55e6b00e1d992 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 14:50:41 -0600 Subject: [PATCH 58/93] fix docstring --- libensemble/executors/balsam_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index f034a269f..d73cf871c 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -10,7 +10,7 @@ One key difference to considser between this executor and libEnsemble's others is that instead of registering paths to apps, Balsam ``ApplicationDefinition`` instances must be registered instead. Furthermore, task submissions will not run until -Balsam reserves compute resources via a ``BatchJob``. This process may resemble: +Balsam reserves compute resources via a ``BatchJob``. 
This process may resemble:: from libensemble.executors import BalsamExecutor from balsam.api import ApplicationDefinition, BatchJob @@ -41,7 +41,6 @@ class HelloApp(ApplicationDefinition): .. _Balsam: https://balsam.readthedocs.io/en/latest/ .. _`transferred back`: https://balsam.readthedocs.io/en/latest/user-guide/transfer/ .. _Globus: https://www.globus.org/ - """ import os From da1dc368639f18a081d22ee79523640148bb5d14 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 15:09:49 -0600 Subject: [PATCH 59/93] additional docs --- libensemble/executors/balsam_executor.py | 20 ++++++++++++++++++- .../scaling_tests/balsam_forces/readme.md | 3 +++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index d73cf871c..f4e1ae372 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -2,6 +2,9 @@ This module launches and controls the running of tasks with Balsam_, and most notably can submit tasks from any machine, to any machine running a Balsam site_. +At this time, access to Balsam is limited to those with valid organizational logins +authenticated through Globus_. + In order to initiate a Balsam executor, the calling script should contain :: from libensemble.executors import BalsamExecutor @@ -17,7 +20,7 @@ class HelloApp(ApplicationDefinition): site = "my-balsam-site" - command_template = "/path/to/hello.app" + command_template = "/path/to/hello.app {{ my_name }}" exctr = BalsamExecutor() exctr.register_app(HelloApp, app_name="hello") @@ -30,6 +33,21 @@ class HelloApp(ApplicationDefinition): project="my-project", ) +Task submissions of registered apps aren't too different from the other executors, +except Balsam expects application arguments in dictionary form. 
Note that these fields +match the templating syntax in the above ``ApplicationDefinition``'s ``command_template`` +field:: + + args = {"my_name": "World"} + + task = exctr.submit( + app_name="hello", + app_args=args, + num_procs=4, + num_nodes=1, + procs_per_node=4, + ) + Instances of the ``HelloApp`` application submitted by the executor within a user function to the Balsam service will get scheduled within the reserved resource allocation. Results, including output files, will appear in the Balsam site's ``data`` directory, diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 3564f508e..baa661606 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -96,6 +96,9 @@ to be transferred back to your local launch directory after every app run. The simulation function will wait for Balsam to transfer back a stat file, then determine the calc status based on the received output. +*To transfer files to Theta*, you will need to login to Globus and activate +the ``alcf#dtn_theta`` Managed Public Endpoint. + ### (Optional) Running libEnsemble as a Balsam app on compute nodes The previous instructions for running libEnsemble are understandably insufficient From 9eca5ca085082841fa97eca826188636358bbdbd Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 16:14:57 -0600 Subject: [PATCH 60/93] additional options for jobs and batchjobs, including tags, partitions, etc. 
--- libensemble/executors/balsam_executor.py | 28 ++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index f4e1ae372..3663bb649 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -315,11 +315,14 @@ def submit_allocation( job_mode="mpi", queue="local", project="local", + optional_params={}, + filter_tags={}, + partitions=[], ): """ Submits a Balsam ``BatchJob`` machine allocation request to Balsam. Corresponding Balsam applications with a matching site can be submitted to - this allocation. + this allocation. Effectively a wrapper for ``BatchJob.objects.create()``. Parameters ---------- @@ -342,6 +345,16 @@ def submit_allocation( project: String, optional Specifies the project that should be charged for the requested hours. Default: "local" + optional_params: dict, optional + Additional system-specific parameters to set, based on fields in Balsam's job-template.sh + + filter_tags: dict, optional + Directs the resultant BatchJob to only run Jobs with matching tags. + + partitions: list of dicts, optional + Divides the allocation into multiple launcher partitions, with differing + ``job_mode``, ``num_nodes``. ``filter_tags``, etc. See the Balsam docs. + Returns ------- @@ -355,6 +368,9 @@ def submit_allocation( job_mode=job_mode, queue=queue, project=project, + optional_params=optional_params, + filter_tags=filter_tags, + partitions=partitions ) self.allocations.append(allocation) @@ -407,6 +423,7 @@ def submit( dry_run=False, wait_on_start=False, extra_args={}, + tags={}, ): """Initializes and submits a Balsam Job based on a registered ApplicationDefinition and requested resource parameters. A corresponding libEnsemble Task object @@ -438,10 +455,10 @@ def submit( machinefile: string, optional Name of a machinefile for this task to use. 
Unused by Balsam - gpus_per_rank: int + gpus_per_rank: int, optional Number of GPUs to reserve for each MPI rank - transfers: dict + transfers: dict, optional A Job-specific Balsam transfers dictionary that corresponds with an ApplicationDefinition ``transfers`` field. See the Balsam docs for more information. @@ -458,9 +475,12 @@ def submit( Whether to block, and wait for task to be polled as RUNNING (or other active/end state) before continuing - extra_args: dict + extra_args: dict, optional Additional arguments to supply to MPI runner. + tags: dict, optional + Additional tags to organize the Job or restrict which BatchJobs run it. + Returns ------- From b9d0271a064f315f4d31395910ee03b9b915b45b Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 7 Mar 2022 16:22:10 -0600 Subject: [PATCH 61/93] fix legacy balsam test --- libensemble/tests/regression_tests/script_test_balsam_hworld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/regression_tests/script_test_balsam_hworld.py b/libensemble/tests/regression_tests/script_test_balsam_hworld.py index 0b5b9488f..80f84ab6a 100644 --- a/libensemble/tests/regression_tests/script_test_balsam_hworld.py +++ b/libensemble/tests/regression_tests/script_test_balsam_hworld.py @@ -7,7 +7,7 @@ import mpi4py from mpi4py import MPI -from libensemble.executors.balsam_executor import LegacyBalsamMPIExecutor +from libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor from libensemble.message_numbers import WORKER_DONE, WORKER_KILL_ON_ERR, WORKER_KILL_ON_TIMEOUT, TASK_FAILED from libensemble.libE import libE from libensemble.sim_funcs.executor_hworld import executor_hworld From 0b70819ea018434e77664e5f9c1c8cf6e255ebe5 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 8 Mar 2022 10:28:48 -0600 Subject: [PATCH 62/93] some docs clarifications, monospace classes and functions throughout --- libensemble/executors/balsam_executor.py | 113 +++++++++++------------ 1 file changed, 56 
insertions(+), 57 deletions(-) diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index 3663bb649..a7e5d7679 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -10,13 +10,14 @@ from libensemble.executors import BalsamExecutor exctr = BalsamExecutor() -One key difference to considser between this executor and libEnsemble's others is -that instead of registering paths to apps, Balsam ``ApplicationDefinition`` instances -must be registered instead. Furthermore, task submissions will not run until -Balsam reserves compute resources via a ``BatchJob``. This process may resemble:: +Key differences to consider between this executor and libEnsemble's others is +Balsam ``ApplicationDefinition`` instances are registered instead of paths and task +submissions will not run until Balsam reserves compute resources at a site. + +This process may resemble:: from libensemble.executors import BalsamExecutor - from balsam.api import ApplicationDefinition, BatchJob + from balsam.api import ApplicationDefinition class HelloApp(ApplicationDefinition): site = "my-balsam-site" @@ -26,8 +27,8 @@ class HelloApp(ApplicationDefinition): exctr.register_app(HelloApp, app_name="hello") exctr.submit_allocation( - site_id=999, - num_nodes=4, + site_id=999, # corresponds to "my-balsam-site", found via ``balsam site ls`` + num_nodes=4, # Total number of nodes requested for *all jobs* wall_time_min=30, queue="debug-queue", project="my-project", @@ -35,7 +36,7 @@ class HelloApp(ApplicationDefinition): Task submissions of registered apps aren't too different from the other executors, except Balsam expects application arguments in dictionary form. 
Note that these fields -match the templating syntax in the above ``ApplicationDefinition``'s ``command_template`` +must match the templating syntax in each ``ApplicationDefinition``'s ``command_template`` field:: args = {"my_name": "World"} @@ -48,12 +49,13 @@ class HelloApp(ApplicationDefinition): procs_per_node=4, ) -Instances of the ``HelloApp`` application submitted by the executor within a user -function to the Balsam service will get scheduled within the reserved resource allocation. -Results, including output files, will appear in the Balsam site's ``data`` directory, -but can be `transferred back`_ via Globus_. +Application instances submitted by the executor to the Balsam service will get +scheduled within the reserved resource allocation. **Each Balsam app can only be +submitted to the site specified in its class definition.** Output files will appear +in the Balsam site's ``data`` directory, but can be automatically `transferred back`_ +via Globus. -*Reading Balsam's documentation is highly recommended.* +**Reading Balsam's documentation is highly recommended.** .. _site: https://balsam.readthedocs.io/en/latest/user-guide/site-config/ .. _Balsam: https://balsam.readthedocs.io/en/latest/ @@ -84,11 +86,11 @@ class HelloApp(ApplicationDefinition): class BalsamTask(Task): - """Wraps a Balsam Job from the Balsam service. + """Wraps a Balsam ``Job`` from the Balsam service. The same attributes and query routines are implemented. Use ``task.process`` - to refer to the matching Balsam Job initialized by the NewBalsamExecutor, - with every Balsam Job method invocable on it. Otherwise, libEnsemble task methods + to refer to the matching Balsam ``Job`` initialized by the ``BalsamExecutor``, + with every Balsam ``Job`` method invocable on it. Otherwise, libEnsemble task methods like ``poll()`` can be used directly. """ @@ -102,9 +104,9 @@ def __init__( stderr=None, workerid=None, ): - """Instantiate a new NewLegacyBalsamTask instance. 
+ """Instantiate a new ``BalsamTask`` instance. - A new BalsamTask object is created with an id, status and + A new ``BalsamTask`` object is created with an id, status and configuration attributes. This will normally be created by the executor on a submission. """ @@ -112,7 +114,7 @@ def __init__( Task.__init__(self, app, app_args, workdir, stdout, stderr, workerid) def _get_time_since_balsam_submit(self): - """Return time since balsam task entered RUNNING state""" + """Return time since balsam task entered ``RUNNING`` state""" event_query = EventLog.objects.filter(job_id=self.process.id, to_state="RUNNING") if not len(event_query): @@ -203,7 +205,7 @@ def poll(self): ) def wait(self, timeout=None): - """Waits on completion of the task or raises TimeoutExpired exception + """Waits on completion of the task or raises ``TimeoutExpired``. Status attributes of task are updated on completion. @@ -211,7 +213,7 @@ def wait(self, timeout=None): ---------- timeout: int - Time in seconds after which a TimeoutExpired exception is raised""" + Time in seconds after which a ``TimeoutExpired`` exception is raised""" if self.dry_run: return @@ -249,15 +251,15 @@ def kill(self): class BalsamExecutor(Executor): - """Inherits from Executor and wraps the Balsam service. Via this Executor, - Balsam Jobs can be submitted to Balsam sites, either local or on remote machines. + """Inherits from ``Executor`` and wraps the Balsam service. Via this Executor, + Balsam ``Jobs`` can be submitted to Balsam sites, either local or on remote machines. .. note:: Task kills are not configurable in the Balsam executor. 
""" def __init__(self): - """Instantiate a new BalsamExecutor instance.""" + """Instantiate a new ``BalsamExecutor`` instance.""" super().__init__() @@ -269,26 +271,26 @@ def serial_setup(self): pass def add_app(self, name, site, exepath, desc): - """Sync application with balsam service""" + """Sync application with Balsam service""" pass def register_app(self, BalsamApp, app_name, calc_type=None, desc=None): - """Registers a Balsam ApplicationDefinition to libEnsemble. This class + """Registers a Balsam ``ApplicationDefinition`` to libEnsemble. This class instance *must* have a ``site`` and ``command_template`` specified. See the Balsam docs for information on other optional fields. Parameters ---------- - BalsamApp: ApplicationDefinition object - A Balsam ApplicationDefinition instance. + BalsamApp: ``ApplicationDefinition`` object + A Balsam ``ApplicationDefinition`` instance. app_name: String, optional Name to identify this application. calc_type: String, optional - Calculation type: Set this application as the default 'sim' - or 'gen' function. + Calculation type: Set this application as the default ``'sim'`` + or ``'gen'`` function. desc: String, optional Description of this application @@ -328,28 +330,28 @@ def submit_allocation( ---------- site_id: int - The corresponding site_id for a Balsam site. Retrieve via ``balsam site ls`` + The corresponding ``site_id`` for a Balsam site. Retrieve via ``balsam site ls`` num_nodes: int The number of nodes to request from a machine with a running Balsam site wall_time_min: int - The number of walltime minutes to request for the BatchJob allocation + The number of walltime minutes to request for the ``BatchJob`` allocation job_mode: String, optional - Either "serial" or "mpi". Default: "mpi" + Either ``"serial"`` or ``"mpi"``. Default: ``"mpi"`` queue: String, optional - Specifies the queue from which the BatchJob should request nodes. 
Default: "local" + Specifies the queue from which the ``BatchJob`` should request nodes. Default: ``"local"`` project: String, optional - Specifies the project that should be charged for the requested hours. Default: "local" + Specifies the project that should be charged for the requested machine time. Default: ``"local"`` optional_params: dict, optional - Additional system-specific parameters to set, based on fields in Balsam's job-template.sh + Additional system-specific parameters to set, based on fields in Balsam's ``job-template.sh`` filter_tags: dict, optional - Directs the resultant BatchJob to only run Jobs with matching tags. + Directs the resultant ``BatchJob`` to only run Jobs with matching tags. partitions: list of dicts, optional Divides the allocation into multiple launcher partitions, with differing @@ -384,15 +386,15 @@ def submit_allocation( def revoke_allocation(self, allocation): """ - Terminates a Balsam BatchJob machine allocation remotely. Balsam apps should + Terminates a Balsam ``BatchJob`` machine allocation remotely. Balsam apps should no longer be submitted to this allocation. Best to run after libEnsemble - completes, or after this BatchJob is no longer needed. Helps save machine time. + completes, or after this ``BatchJob`` is no longer needed. Helps save machine time. Parameters ---------- - allocation: BatchJob object - a BatchJob with a corresponding machine allocation that should be cancelled. + allocation: ``BatchJob`` object + a ``BatchJob`` with a corresponding machine allocation that should be cancelled. """ allocation.refresh_from_db() @@ -425,16 +427,15 @@ def submit( extra_args={}, tags={}, ): - """Initializes and submits a Balsam Job based on a registered ApplicationDefinition - and requested resource parameters. A corresponding libEnsemble Task object - is created and returned. + """Initializes and submits a Balsam ``Job`` based on a registered ``ApplicationDefinition`` + and requested resources. 
A corresponding libEnsemble ``Task`` object is returned. calc_type: String, optional - The calculation type: 'sim' or 'gen' - Only used if app_name is not supplied. Uses default sim or gen application. + The calculation type: ``'sim'`` or ``'gen'`` + Only used if ``app_name`` is not supplied. Uses default sim or gen application. app_name: String, optional - The application name. Must be supplied if calc_type is not. + The application name. Must be supplied if ``calc_type`` is not. app_args: dict A dictionary of options that correspond to fields to template in the @@ -460,26 +461,26 @@ def submit( transfers: dict, optional A Job-specific Balsam transfers dictionary that corresponds with an - ApplicationDefinition ``transfers`` field. See the Balsam docs for + ``ApplicationDefinition`` ``transfers`` field. See the Balsam docs for more information. workdir: String Specifies as name for the Job's output directory within the Balsam site's - data directory. Default: libe_workflow + data directory. Default: ``libe_workflow`` dry_run: boolean, optional - Whether this is a dry_run - no task will be launched; instead - runline is printed to logger (at INFO level) + Whether this is a dry run - no task will be launched; instead + runline is printed to logger (at ``INFO`` level) wait_on_start: boolean, optional - Whether to block, and wait for task to be polled as RUNNING (or other + Whether to block, and wait for task to be polled as ``RUNNING`` (or other active/end state) before continuing extra_args: dict, optional Additional arguments to supply to MPI runner. tags: dict, optional - Additional tags to organize the Job or restrict which BatchJobs run it. + Additional tags to organize the ``Job`` or restrict which ``BatchJobs`` run it. 
Returns ------- @@ -488,9 +489,9 @@ def submit( The launched task object Note that since Balsam Jobs are often sent to entirely different machines - than where libEnsemble is running, that how libEnsemble's resource manager + than where libEnsemble is running, how libEnsemble's resource manager has divided local resources among workers doesn't impact what resources - can be requested for a Balsam Job running on an entirely different machine. + can be requested for a Balsam ``Job`` running on an entirely different machine. """ @@ -506,7 +507,6 @@ def submit( else: workdir = self.workflow_name - # Specific to this class if machinefile is not None: logger.warning("machinefile arg ignored - not supported in Balsam") jassert( @@ -553,6 +553,5 @@ def submit( "Submitted Balsam App to site {}: " "nodes {} ppn {}".format(App.site, num_nodes, procs_per_node) ) - # task.workdir = task.process.working_directory # Might not be set yet! self.list_of_tasks.append(task) return task From b58c1cd32e5c400526a313cfe048be90e7996d0f Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 8 Mar 2022 12:26:22 -0600 Subject: [PATCH 63/93] detect via hostname if submission script running on theta --- .../scaling_tests/balsam_forces/run_libe_forces_balsam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 8a999591d..00062aaa7 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -1,11 +1,12 @@ #!/usr/bin/env python +import socket import numpy as np from libensemble import Ensemble from libensemble.executors import BalsamExecutor from balsam.api import ApplicationDefinition -THIS_SCRIPT_ON_THETA = True # Is this running on a personal machine, or a compute node? 
+THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) # Is this running on a personal machine, or a compute node? # Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True From 3c3e48ba6a156f3e14993d5355399f9f08febfd9 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Mar 2022 12:40:07 -0600 Subject: [PATCH 64/93] Black for scaling --- .../tests/scaling_tests/balsam_forces/forces_simf.py | 8 ++------ .../balsam_forces/run_libe_forces_balsam.py | 9 ++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index b62acc206..8bce1e1a8 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -88,9 +88,7 @@ def read_last_line(filepath): if THIS_SCRIPT_ON_THETA: statfile = "../" + workdir + "/" + file_dest.split("/")[-1] if read_last_line(statfile) == "kill": - print( - "Warning: Task completed although marked as a bad run (kill flag set in forces.stat)" - ) + print("Warning: Task completed although marked as a bad run (kill flag set in forces.stat)") calc_status = TASK_FAILED else: calc_status = WORKER_DONE @@ -105,9 +103,7 @@ def read_last_line(filepath): else: if TRANSFER_STATFILES: print("Waiting for Task {} statfile.".format(task.name)) - while file_dest not in [ - os.path.join(os.getcwd(), i) for i in os.listdir(".") - ]: + while file_dest not in [os.path.join(os.getcwd(), i) for i in os.listdir(".")]: time.sleep(1) if read_last_line(file_dest) == "kill": diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 00062aaa7..f58d8b332 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ 
b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -6,15 +6,14 @@ from libensemble.executors import BalsamExecutor from balsam.api import ApplicationDefinition -THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) # Is this running on a personal machine, or a compute node? +THIS_SCRIPT_ON_THETA = any( + [i in socket.gethostname() for i in ["theta", "nid0"]] +) # Is this running on a personal machine, or a compute node? # Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True GLOBUS_ENDPOINT = "jln_laptop" -GLOBUS_DEST_DIR = ( - "/Users/jnavarro/Desktop/libensemble" - + "/libensemble/libensemble/tests/scaling_tests/balsam_forces" -) +GLOBUS_DEST_DIR = "/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces" forces = Ensemble() forces.from_yaml("balsam_forces.yaml") From d5dafcda1e68bb7c942a0b39f173f6863c17824d Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Tue, 8 Mar 2022 12:41:13 -0600 Subject: [PATCH 65/93] Black on deprecated tests --- .../balsam_tests/setup_balsam_tests.py | 29 +++++++++--------- .../balsam_tests/test_balsam_1__runjobs.py | 30 +++++++++++-------- .../balsam_tests/test_balsam_2__workerkill.py | 30 +++++++++++-------- .../test_balsam_3__managerkill.py | 30 +++++++++++-------- 4 files changed, 66 insertions(+), 53 deletions(-) diff --git a/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py b/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py index 00c8052e1..0ed941443 100755 --- a/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py +++ b/libensemble/tests/deprecated_tests/balsam_tests/setup_balsam_tests.py @@ -12,15 +12,16 @@ import balsam.launcher.dag as dag from balsam.service import models + AppDef = models.ApplicationDefinition # Ok so more low level - but can interface app stuff in python directly def add_app(name, exepath, desc): - """ Add application to 
database """ + """Add application to database""" app = AppDef() app.name = name - app.executable = exepath # “/full/path/to/python/interpreter /full/path/to/script.py" + app.executable = exepath # “/full/path/to/python/interpreter /full/path/to/script.py" app.description = desc # app.default_preprocess = '' # optional # app.default_postprocess = '' # optional @@ -30,14 +31,14 @@ def add_app(name, exepath, desc): # As balsam req python 3.6 lets use subprocess.run # For any stuff requiring CLI def run_cmd(cmd, echo=False): - """ Run a bash command """ + """Run a bash command""" if echo: print("\nRunning %s ...\n" % cmd) try: subprocess.run(cmd.split(), check=True) except Exception as e: print(e) - raise("Error: Command %s failed to run" % cmd) + raise ("Error: Command %s failed to run" % cmd) # Use relative paths to balsam_tests dir @@ -50,9 +51,7 @@ def run_cmd(cmd, echo=False): num_nodes = 1 procs_per_node = 4 -job_list = ['test_balsam_1__runjobs.py', - 'test_balsam_2__workerkill.py', - 'test_balsam_3__managerkill.py'] +job_list = ['test_balsam_1__runjobs.py', 'test_balsam_2__workerkill.py', 'test_balsam_3__managerkill.py'] # Currently think only CLI interface for this stuff?? @@ -80,13 +79,15 @@ def run_cmd(cmd, echo=False): add_app(app_name, run_line, app_desc) job_name = 'job_' + app_name - dag.add_job(name=job_name, - workflow="libe_workflow", - application=app_name, - num_nodes=num_nodes, - procs_per_node=procs_per_node, - stage_out_url="local:" + work_dir, - stage_out_files=job_name + ".out") + dag.add_job( + name=job_name, + workflow="libe_workflow", + application=app_name, + num_nodes=num_nodes, + procs_per_node=procs_per_node, + stage_out_url="local:" + work_dir, + stage_out_files=job_name + ".out", + ) # Add dependency between jobs so run one at a time. 
if prev_job_name: diff --git a/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_1__runjobs.py b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_1__runjobs.py index 3e9592fe8..fe339aa08 100644 --- a/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_1__runjobs.py +++ b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_1__runjobs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -import os # for adding to path +import os # for adding to path import time from mpi4py import MPI @@ -32,7 +32,7 @@ def poll_until_state(job, state, timeout_sec=60.0, delay=2.0): os.mkdir(sim_path) except Exception as e: print(e) - raise("Cannot make simulation directory %s" % sim_path) + raise ("Cannot make simulation directory %s" % sim_path) MPI.COMM_WORLD.Barrier() # Ensure output dir created print("Host job rank is %d Output dir is %s" % (myrank, sim_input_dir)) @@ -41,20 +41,24 @@ def poll_until_state(job, state, timeout_sec=60.0, delay=2.0): for sim_id in range(steps): jobname = 'outfile_t1_' + 'for_sim_id_' + str(sim_id) + '_ranks_' + str(myrank) + '.txt' - current_job = dag.add_job(name=jobname, - workflow="libe_workflow", - application="helloworld", - application_args=str(sleep_time), - num_nodes=1, - procs_per_node=8, - stage_out_url="local:" + sim_path, - stage_out_files=jobname + ".out") + current_job = dag.add_job( + name=jobname, + workflow="libe_workflow", + application="helloworld", + application_args=str(sleep_time), + num_nodes=1, + procs_per_node=8, + stage_out_url="local:" + sim_path, + stage_out_files=jobname + ".out", + ) success = poll_until_state(current_job, 'JOB_FINISHED') # OR job killed if success: - print("Completed job: %s rank=%d time=%f" % (jobname, myrank, time.time()-start)) + print("Completed job: %s rank=%d time=%f" % (jobname, myrank, time.time() - start)) else: - print("Task not completed: %s rank=%d time=%f Status" % (jobname, myrank, time.time()-start), current_job.state) + print( + "Task not completed: %s rank=%d 
time=%f Status" % (jobname, myrank, time.time() - start), current_job.state + ) end = time.time() -print("Done: rank=%d time=%f" % (myrank, end-start)) +print("Done: rank=%d time=%f" % (myrank, end - start)) diff --git a/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_2__workerkill.py b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_2__workerkill.py index 50f8ae311..2a20928a4 100644 --- a/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_2__workerkill.py +++ b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_2__workerkill.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -import os # for adding to path +import os # for adding to path import time from mpi4py import MPI @@ -34,7 +34,7 @@ def poll_until_state(job, state, timeout_sec=120.0, delay=2.0): os.mkdir(sim_path) except Exception as e: print(e) - raise("Cannot make simulation directory %s" % sim_path) + raise ("Cannot make simulation directory %s" % sim_path) MPI.COMM_WORLD.Barrier() # Ensure output dir created print("Host job rank is %d Output dir is %s" % (myrank, sim_input_dir)) @@ -43,22 +43,26 @@ def poll_until_state(job, state, timeout_sec=120.0, delay=2.0): for sim_id in range(steps): jobname = 'outfile_t2_' + 'for_sim_id_' + str(sim_id) + '_ranks_' + str(myrank) + '.txt' - current_job = dag.add_job(name=jobname, - workflow="libe_workflow", - application="helloworld", - application_args=str(sleep_time), - num_nodes=1, - procs_per_node=8, - stage_out_url="local:" + sim_path, - stage_out_files=jobname + ".out") + current_job = dag.add_job( + name=jobname, + workflow="libe_workflow", + application="helloworld", + application_args=str(sleep_time), + num_nodes=1, + procs_per_node=8, + stage_out_url="local:" + sim_path, + stage_out_files=jobname + ".out", + ) if sim_id == 1: dag.kill(current_job) success = poll_until_state(current_job, 'JOB_FINISHED') # OR job killed if success: - print("Completed job: %s rank=%d time=%f" % (jobname, myrank, time.time()-start)) + 
print("Completed job: %s rank=%d time=%f" % (jobname, myrank, time.time() - start)) else: - print("Task not completed: %s rank=%d time=%f Status" % (jobname, myrank, time.time()-start), current_job.state) + print( + "Task not completed: %s rank=%d time=%f Status" % (jobname, myrank, time.time() - start), current_job.state + ) end = time.time() -print("Done: rank=%d time=%f" % (myrank, end-start)) +print("Done: rank=%d time=%f" % (myrank, end - start)) diff --git a/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_3__managerkill.py b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_3__managerkill.py index dc6cfd8b9..5990e6c5d 100644 --- a/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_3__managerkill.py +++ b/libensemble/tests/deprecated_tests/balsam_tests/test_balsam_3__managerkill.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -import os # for adding to path +import os # for adding to path import time from mpi4py import MPI @@ -34,7 +34,7 @@ def poll_until_state(job, state, timeout_sec=120.0, delay=2.0): os.mkdir(sim_path) except Exception as e: print(e) - raise("Cannot make simulation directory %s" % sim_path) + raise ("Cannot make simulation directory %s" % sim_path) MPI.COMM_WORLD.Barrier() # Ensure output dir created print("Host job rank is %d Output dir is %s" % (myrank, sim_input_dir)) @@ -43,14 +43,16 @@ def poll_until_state(job, state, timeout_sec=120.0, delay=2.0): for sim_id in range(steps): jobname = 'outfile_t3_' + 'for_sim_id_' + str(sim_id) + '_ranks_' + str(myrank) + '.txt' - current_job = dag.add_job(name=jobname, - workflow="libe_workflow", - application="helloworld", - application_args=str(sleep_time), - num_nodes=1, - procs_per_node=8, - stage_out_url="local:" + sim_path, - stage_out_files=jobname + ".out") + current_job = dag.add_job( + name=jobname, + workflow="libe_workflow", + application="helloworld", + application_args=str(sleep_time), + num_nodes=1, + procs_per_node=8, + stage_out_url="local:" + sim_path, + 
stage_out_files=jobname + ".out", + ) # Kill only from manager - pending and running jobs of given ID if myrank == 0: @@ -77,9 +79,11 @@ def poll_until_state(job, state, timeout_sec=120.0, delay=2.0): success = poll_until_state(current_job, 'JOB_FINISHED') # OR job killed if success: - print("Completed job: %s rank=%d time=%f" % (jobname, myrank, time.time()-start)) + print("Completed job: %s rank=%d time=%f" % (jobname, myrank, time.time() - start)) else: - print("Task not completed: %s rank=%d time=%f Status" % (jobname, myrank, time.time()-start), current_job.state) + print( + "Task not completed: %s rank=%d time=%f Status" % (jobname, myrank, time.time() - start), current_job.state + ) end = time.time() -print("Done: rank=%d time=%f" % (myrank, end-start)) +print("Done: rank=%d time=%f" % (myrank, end - start)) From aef1cca57dd3ec06d351abf95e7302939a83ccbc Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 10 Mar 2022 13:36:26 -0600 Subject: [PATCH 66/93] tiny docs changes --- docs/platforms/platforms_index.rst | 2 +- libensemble/executors/balsam_executor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index 8288f540f..45eb6e838 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -178,7 +178,7 @@ key. 
For example:: 'sim_f': sim_f, 'in': ['x'], 'out': [('f', float)], - 'funcx_endpoint': 3af6dc24-3f27-4c49-8d11-e301ade15353, + 'funcx_endpoint': '3af6dc24-3f27-4c49-8d11-e301ade15353', } See the ``libensemble/tests/scaling_tests/funcx_forces`` directory for a complete diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index a7e5d7679..903448021 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -212,7 +212,7 @@ def wait(self, timeout=None): Parameters ---------- - timeout: int + timeout: float Time in seconds after which a ``TimeoutExpired`` exception is raised""" if self.dry_run: From 2cbe67ac07cfc055ac60f2c77c42bb2f31fc1783 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 14 Mar 2022 13:46:21 -0500 Subject: [PATCH 67/93] update README for new balsam --- README.rst | 10 +++++----- docs/introduction_latex.rst | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 65f58e8ec..f7f046249 100644 --- a/README.rst +++ b/README.rst @@ -82,10 +82,9 @@ Optional dependencies: * Balsam_ -If running on the the compute nodes of three-tier systems -like OLCF's Summit_ or ALCF's Theta_, libEnsemble's workers may use the Balsam service -to schedule and launch MPI applications. Otherwise, libEnsemble can be run with -multiprocessing on the intermediate launch nodes. +As of v0.8.0+dev, libEnsemble features an updated `Balsam Executor`_ +for workers to schedule and launch applications to *anywhere* with a running +Balsam site, including to remote machines. * pyyaml_ @@ -294,7 +293,8 @@ See a complete list of `example user scripts`_. .. _across: https://libensemble.readthedocs.io/en/develop/platforms/platforms_index.html#funcx-remote-user-functions .. _APOSMM: https://link.springer.com/article/10.1007/s12532-017-0131-4 .. _AWA: https://link.springer.com/article/10.1007/s12532-017-0131-4 -.. 
_Balsam: https://www.alcf.anl.gov/support-center/theta/balsam +.. _Balsam: https://balsam.readthedocs.io/en/latest/ +.. _Balsam Executor: https://libensemble.readthedocs.io/en/develop/executor/balsam_2_executor.html .. _Community Examples repository: https://github.com/Libensemble/libe-community-examples .. _Conda: https://docs.conda.io/en/latest/ .. _conda-forge: https://conda-forge.org/ diff --git a/docs/introduction_latex.rst b/docs/introduction_latex.rst index 6eee1c949..f6b2c42da 100644 --- a/docs/introduction_latex.rst +++ b/docs/introduction_latex.rst @@ -25,7 +25,8 @@ We now present further information on running and testing libEnsemble. .. _across: https://libensemble.readthedocs.io/en/develop/platforms/platforms_index.html#funcx-remote-user-functions .. _APOSMM: https://link.springer.com/article/10.1007/s12532-017-0131-4 .. _AWA: https://link.springer.com/article/10.1007/s12532-017-0131-4 -.. _Balsam: https://www.alcf.anl.gov/support-center/theta/balsam +.. _Balsam: https://balsam.readthedocs.io/en/latest/ +.. _Balsam Executor: https://libensemble.readthedocs.io/en/develop/executor/balsam_2_executor.html .. _Community Examples repository: https://github.com/Libensemble/libe-community-examples .. _Conda: https://docs.conda.io/en/latest/ .. 
_conda-forge: https://conda-forge.org/ From 5823f04fe7d03654dded7b8e529e8d145592cbea Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 22 Mar 2022 10:21:19 -0500 Subject: [PATCH 68/93] rearranges scripts so Balsam Apps are defined in another script --- .../balsam_forces/define_apps.py | 66 ++++++++++++++++ .../scaling_tests/balsam_forces/readme.md | 4 +- .../balsam_forces/run_libe_forces_balsam.py | 4 +- .../submit_libe_forces_balsam.py | 77 ++++++------------- 4 files changed, 95 insertions(+), 56 deletions(-) create mode 100644 libensemble/tests/scaling_tests/balsam_forces/define_apps.py diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py new file mode 100644 index 000000000..bbc6dfb97 --- /dev/null +++ b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py @@ -0,0 +1,66 @@ +from balsam.api import ApplicationDefinition + +""" +This script uses the Balsam API to define and sync two types of Balsam apps: +a libEnsemble app, and a Forces app: + + - The libEnsemble app runs the calling script ``run_libe_forces_balsam.py``. + An input transfer is also specified, but parameterized in + ``submit_libe_forces_balsam.py`` as part of the Job specification process. + + - The Forces app is defined and synced with Balsam. The libEnsemble app + will submit instances of the Forces app to the Balsam service for scheduling + on a running batch session at its site. An optional output transfer is defined; + forces.stat files are transferred back to the Globus endpoint defined in + run_libe_forces_balsam.py + +Unless changes are made to these Apps, this should only need to be run once to +register each of these apps with the Balsam service. 
+""" + + +class LibensembleApp(ApplicationDefinition): + site = "jln_theta" + command_template = ( + "/home/jnavarro/.conda/envs/again/bin/python /home/jnavarro" + + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py" + + " > libe_out.txt 2>&1" + ) + + transfers = { + "input_file": { + "required": True, + "direction": "in", + "local_path": ".", + "description": "Transfer in of balsam_forces.yaml", + "recursive": False, + } + } + +print("Defined LibensembleApp Balsam ApplicationDefinition.") + +class RemoteForces(ApplicationDefinition): + site = "jln_theta" + command_template = ( + "/home/jnavarro" + + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" + + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + + " > out.txt 2>&1" + ) + + transfers = { + "result": { + "required": False, + "direction": "out", + "local_path": "forces.stat", + "description": "Forces stat file", + "recursive": False, + } + } + +print("Defined RemoteForces Balsam ApplicationDefinition.") + +LibensembleApp.sync() +RemoteForces.sync() + +print("Synced each app with the Balsam service.") diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index baa661606..4e928d73c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -47,8 +47,8 @@ to list your sites and `balsam job rm --all` to remove extraneous jobs between r ### Configuring and Running libEnsemble. -Configure the `RemoteForces` class in the `run_libe_forces_balsam.py` calling -script to match the Balsam site name and the path to the `forces.x` executable +Configure the `RemoteForces` class in the `submit_libe_forces_balsam.py` submission script + to match the Balsam site name and the path to the `forces.x` executable on the remote machine. 
Configure the `submit_allocation()` function in the calling script to correspond with the site's ID (an integer found via `balsam site ls`), as well as the correct queue and project for the machine the Balsam site was initialized on. diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index f58d8b332..0a724bf5f 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -6,6 +6,8 @@ from libensemble.executors import BalsamExecutor from balsam.api import ApplicationDefinition +BALSAM_SITE = "jln_theta" + THIS_SCRIPT_ON_THETA = any( [i in socket.gethostname() for i in ["theta", "nid0"]] ) # Is this running on a personal machine, or a compute node? @@ -30,7 +32,7 @@ forces.persis_info.add_random_streams() -apps = ApplicationDefinition.load_by_site("jln_theta") +apps = ApplicationDefinition.load_by_site(BALSAM_SITE) RemoteForces = apps["RemoteForces"] exctr = BalsamExecutor() diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py index 15deef330..2d8fe18fc 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py @@ -2,54 +2,48 @@ import glob from balsam.api import ApplicationDefinition, BatchJob +""" +This file is roughly equivalent to a traditional batch submission shell script +that used legacy Balsam commands, except it uses the Balsam API to submit jobs +to the scheduler. It can also be run from anywhere and still submit jobs to +the same machine. It loads, parameterizes, and submits the LibensembleApp for +execution. +""" + +# To which Balsam site should these apps be submitted? 
+BALSAM_SITE = "jln_theta" + # Batch Session Parameters -SIM_MAX = 16 # make sure matches in balsam_forces.yaml BATCH_NUM_NODES = 5 BATCH_WALL_CLOCK_TIME = 60 PROJECT = "CSC250STMS07" QUEUE = "debug-flat-quad" -# libE Job Parameters - Will use above resources +# libEnsemble Job Parameters - Will use above resources LIBE_NODES = 1 LIBE_RANKS = 5 # Transfer forces.stat files back to this script's source directory? # Adjust run_libe_forces_balsam.py as well!!!! +# SIM_MAX is requested so that this script can wait for all forces.stat files, +# then cancel the remote allocation to save node hours TRANSFER_STATFILES = True +SIM_MAX = 16 # make sure matches in balsam_forces.yaml # Transfer this file to the libE Job's working directory. -# # globus_endpoint_key *specified in local balsam site's settings.yml* # globus_endpoint_key:/path/to/file +# globus_endpoint_key specified in site's settings.yml input_file = ( "jln_laptop:/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" ) -# FOR EACH OF THE FOLLOWING APPS, make sure Balsam sites, home directories, -# pythons, and other paths are updated. - - -class LibensembleApp(ApplicationDefinition): - site = "jln_theta" - command_template = ( - "/home/jnavarro/.conda/envs/again/bin/python /home/jnavarro" - + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py" - + " > libe_out.txt 2>&1" - ) - - transfers = { - "input_file": { - "required": True, - "direction": "in", - "local_path": ".", - "description": "Transfer in of balsam_forces.yaml", - "recursive": False, - } - } - - -print("Defined LibensembleApp Balsam ApplicationDefinition.") +# Retrieve the libEnsemble app from the Balsam service +apps = ApplicationDefinition.load_by_site(BALSAM_SITE) +LibensembleApp = apps["LibensembleApp"] +# Submit the libEnsemble app as a Job to the Balsam service. 
+# It will wait for a running BatchJob session libe_job = LibensembleApp.submit( workdir="libe_workflow/libe_processes", num_nodes=LIBE_NODES, @@ -57,33 +51,9 @@ class LibensembleApp(ApplicationDefinition): transfers={"input_file": input_file}, ) -print("libEnsemble Job created, synced with Balsam. Will run on next BatchJob") - - -class RemoteForces(ApplicationDefinition): - site = "jln_theta" - command_template = ( - "/home/jnavarro" - + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" - + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" - + " > out.txt 2>&1" - ) - - transfers = { - "result": { - "required": False, - "direction": "out", - "local_path": "forces.stat", - "description": "Forces stat file", - "recursive": False, - } - } - - -RemoteForces.sync() - -print("Defined and synced RemoteForces Balsam ApplicationDefinition.") +print("libEnsemble App retrieved and submitted a Job to Balsam service.") +# Submit an allocation (BatchJob) request to the libEnsemble app's site batch = BatchJob.objects.create( site_id=libe_job.site_id, num_nodes=BATCH_NUM_NODES, @@ -95,6 +65,7 @@ class RemoteForces(ApplicationDefinition): print("BatchJob session initialized. 
All Balsam apps will run in this BatchJob.") +# Wait for all forces.stat files to be transferred back, then cancel the BatchJob if TRANSFER_STATFILES: print("Waiting for all returned forces.stat files...") From 552592f837f6198cd1b054c2f0a244ac09beea81 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 22 Mar 2022 11:21:46 -0500 Subject: [PATCH 69/93] start to rearrange README in balsam_forces --- .../scaling_tests/balsam_forces/readme.md | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 4e928d73c..dd7496d64 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -45,10 +45,44 @@ your ALCF credentials. On any machine you've installed and logged into Balsam, you can run `balsam site ls` to list your sites and `balsam job rm --all` to remove extraneous jobs between runs. -### Configuring and Running libEnsemble. +### Configuring libEnsemble -Configure the `RemoteForces` class in the `submit_libe_forces_balsam.py` submission script - to match the Balsam site name and the path to the `forces.x` executable +There are several scripts that each need to be adjusted. To explain each: + +1. ``define_apps.py``: + + About: + + This script defines and syncs each Balsam app with the Balsam service. A Balsam + app is an ``ApplicationDefinition`` class with of ``site`` and + ``command_template`` fields. ``site`` specifies to Balsam on which Balsam site + the app should be run, and ``command_template`` specifies the command that should + be executed, as a Jinja2 string template. + + Configuring: + + Adjust the ``site`` field in each App to match your remote Balsam site. Adjust + the various paths in the ``command_template`` fields to match your home directory + or paths to your pythons. + + **Run this script each time you edit it** + +2. 
``submit_libe_forces_balsam.py``: + + About: + + Configuring: + +3. ``run_libe_forces_balsam.py``: + + About: + + Configuring: + + + +Configure the `RemoteForces` class in the `define_apps.py` submission script +to match the Balsam site name and the path to the `forces.x` executable on the remote machine. Configure the `submit_allocation()` function in the calling script to correspond with the site's ID (an integer found via `balsam site ls`), as well as the correct queue and project for the machine the Balsam site was initialized on. From 09240bfc855737a4df33d1bb5ea2e26330ffbe87 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 23 Mar 2022 13:59:22 -0500 Subject: [PATCH 70/93] some comments reorganization --- .../scaling_tests/balsam_forces/readme.md | 29 ++++++++++++------- .../submit_libe_forces_balsam.py | 19 +++++------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index dd7496d64..43dc1fb68 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -53,34 +53,43 @@ There are several scripts that each need to be adjusted. To explain each: About: - This script defines and syncs each Balsam app with the Balsam service. A Balsam - app is an ``ApplicationDefinition`` class with of ``site`` and + This script defines and syncs each of our Balsam apps with the Balsam service. A Balsam + app is an ``ApplicationDefinition`` class with ``site`` and ``command_template`` fields. ``site`` specifies to Balsam on which Balsam site - the app should be run, and ``command_template`` specifies the command that should - be executed, as a Jinja2 string template. + the app should be run, and ``command_template`` specifies the command (as a Jinja2 + string template) that should be executed. This script contains two apps, ``LibensembleApp`` and ``RemoteForces``. 
Configuring: - Adjust the ``site`` field in each App to match your remote Balsam site. Adjust - the various paths in the ``command_template`` fields to match your home directory - or paths to your pythons. + Adjust the ``site`` field in each ``ApplicationDefinition`` to match your remote + Balsam site. Adjust the various paths in the ``command_template`` fields to match + your home directory and/or Python paths. - **Run this script each time you edit it** + **Run this script each time you edit it,** since changes to each + ``ApplicationDefinition`` need to be synced with the Balsam service. -2. ``submit_libe_forces_balsam.py``: +2. ``run_libe_forces_balsam.py``: About: + This is a typical libEnsemble calling script, but uses the BalsamExecutor + to register + Configuring: -3. ``run_libe_forces_balsam.py``: +3. (optional) ``submit_libe_forces_balsam.py``: About: + This Python script is effectively a batch submission script, capable of checking + out resources + Configuring: + + Configure the `RemoteForces` class in the `define_apps.py` submission script to match the Balsam site name and the path to the `forces.x` executable on the remote machine. Configure the `submit_allocation()` function in the calling diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py index 2d8fe18fc..14707ef3f 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py @@ -10,7 +10,6 @@ execution. """ -# To which Balsam site should these apps be submitted? BALSAM_SITE = "jln_theta" # Batch Session Parameters @@ -23,27 +22,25 @@ LIBE_NODES = 1 LIBE_RANKS = 5 -# Transfer forces.stat files back to this script's source directory? -# Adjust run_libe_forces_balsam.py as well!!!! 
-# SIM_MAX is requested so that this script can wait for all forces.stat files, -# then cancel the remote allocation to save node hours -TRANSFER_STATFILES = True -SIM_MAX = 16 # make sure matches in balsam_forces.yaml - -# Transfer this file to the libE Job's working directory. +# Parameter file for calling script. Must be transferred to Balsam site. # globus_endpoint_key:/path/to/file -# globus_endpoint_key specified in site's settings.yml +# globus_endpoint_key specified in BALSAM_SITE's settings.yml input_file = ( "jln_laptop:/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" ) +# Transfer forces.stat files back to this script's directory? +# If True, this script cancels remote allocation once SIM_MAX statfiles transferred +TRANSFER_STATFILES = True +SIM_MAX = 16 # must match balsam_forces.yaml + # Retrieve the libEnsemble app from the Balsam service apps = ApplicationDefinition.load_by_site(BALSAM_SITE) LibensembleApp = apps["LibensembleApp"] # Submit the libEnsemble app as a Job to the Balsam service. 
-# It will wait for a running BatchJob session +# It will wait for a compatible, running BatchJob session (remote allocation) libe_job = LibensembleApp.submit( workdir="libe_workflow/libe_processes", num_nodes=LIBE_NODES, From 141ebcdd84dfb59d96a86fa86f2eedb6ca45d562 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Thu, 24 Mar 2022 11:20:39 -0500 Subject: [PATCH 71/93] Docs/cleanup deprecated tests (#756) * Blacking deprecated tests * More cleanup of deprecated tests * More cleanup of deprecated tests * Black --- .../test_nan_func_old_aposmm.py | 27 ++++---- .../deprecated_tests/test_old_aposmm_logic.py | 48 ++++++++------ .../test_old_aposmm_one_residual_at_a_time.py | 64 +++++++++++-------- .../test_old_aposmm_pounders.py | 39 ++++++----- .../test_old_aposmm_pounders_splitcomm.py | 41 ++++++------ .../test_old_aposmm_pounders_subcomm.py | 41 ++++++------ .../test_old_aposmm_sim_dirs.py | 64 +++++++++++-------- .../test_old_aposmm_with_gradients.py | 58 +++++++++-------- .../balsam_forces/run_libe_forces_balsam.py | 5 +- 9 files changed, 217 insertions(+), 170 deletions(-) diff --git a/libensemble/tests/deprecated_tests/test_nan_func_old_aposmm.py b/libensemble/tests/deprecated_tests/test_nan_func_old_aposmm.py index 5273872fd..2f335f3f7 100644 --- a/libensemble/tests/deprecated_tests/test_nan_func_old_aposmm.py +++ b/libensemble/tests/deprecated_tests/test_nan_func_old_aposmm.py @@ -25,19 +25,23 @@ nworkers, is_manager, libE_specs, _ = parse_args() n = 2 -sim_specs = {'sim_f': sim_f, - 'in': ['x'], - 'out': [('f', float), ('f_i', float)]} +sim_specs = { + 'sim_f': sim_f, + 'in': ['x'], + 'out': [ + ('f', float), + ('f_i', float), + ], +} gen_out += [('x', float, n), ('x_on_cube', float, n), ('obj_component', int)] -gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out] + ['f', 'f_i', 'returned'], - 'out': gen_out, - 'user': {'initial_sample_size': 5, - 'lb': -2*np.ones(n), - 'ub': 2*np.ones(n)} - } +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in 
gen_out] + ['f', 'f_i', 'returned'], + 'out': gen_out, + 'user': {'initial_sample_size': 5, 'lb': -2 * np.ones(n), 'ub': 2 * np.ones(n)}, +} if nworkers == 3: gen_specs['user']['single_component_at_a_time'] = True @@ -50,8 +54,7 @@ exit_criteria = {'sim_max': 100, 'wallclock_max': 300} # Perform the run -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, - libE_specs=libE_specs) +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) if is_manager: assert flag == 0 assert np.all(~H['local_pt']) diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_logic.py b/libensemble/tests/deprecated_tests/test_old_aposmm_logic.py index ab05c62eb..fe638b584 100644 --- a/libensemble/tests/deprecated_tests/test_old_aposmm_logic.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_logic.py @@ -4,20 +4,22 @@ n = 2 -gen_out = [('x', float, n), - ('x_on_cube', float, n), - ('sim_id', int), - ('priority', float), - ('local_pt', bool), - ('known_to_aposmm', bool), - ('dist_to_unit_bounds', float), - ('dist_to_better_l', float), - ('dist_to_better_s', float), - ('ind_of_better_l', int), - ('ind_of_better_s', int), - ('started_run', bool), - ('num_active_runs', int), - ('local_min', bool)] +gen_out = [ + ('x', float, n), + ('x_on_cube', float, n), + ('sim_id', int), + ('priority', float), + ('local_pt', bool), + ('known_to_aposmm', bool), + ('dist_to_unit_bounds', float), + ('dist_to_better_l', float), + ('dist_to_better_s', float), + ('ind_of_better_l', int), + ('ind_of_better_s', int), + ('started_run', bool), + ('num_active_runs', int), + ('local_min', bool), +] def test_failing_localopt_method(): @@ -77,7 +79,7 @@ def test_declare_opt(): hist, sim_specs_0, gen_specs_0, exit_criteria_0, alloc = setup.hist_setup1(n=2) try: - al.update_history_optimal(hist.H['x_on_cube'][0]+1, hist.H, np.arange(0, 10)) + al.update_history_optimal(hist.H['x_on_cube'][0] + 1, hist.H, np.arange(0, 10)) except 
AssertionError: assert 1, "Failed because the best point is not in H" else: @@ -105,16 +107,20 @@ def test_localopt_error_saving(): gen_specs_0['user']['ub'] = np.ones(2) gen_specs_0['user']['lb'] = np.zeros(2) - persis_info_1 = {'run_order': {0: [1, 2, 3]}, - 'old_runs': {}, - 'total_runs': 0, - 'rand_stream': np.random.default_rng(1)} + persis_info_1 = { + 'run_order': {0: [1, 2, 3]}, + 'old_runs': {}, + 'total_runs': 0, + 'rand_stream': np.random.default_rng(1), + } try: al.aposmm_logic(H, persis_info_1, gen_specs_0, _) except Exception as e: - assert e.args[0] == 'Exit code is 0, but x_new was not updated in local opt run 0 after 3 evaluations.\n'\ - 'Saving run information to: run_0_abort.pickle\nWorker crashing!' + assert ( + e.args[0] == 'Exit code is 0, but x_new was not updated in local opt run 0 after 3 evaluations.\n' + 'Saving run information to: run_0_abort.pickle\nWorker crashing!' + ) else: assert 0 diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_one_residual_at_a_time.py b/libensemble/tests/deprecated_tests/test_old_aposmm_one_residual_at_a_time.py index bc8241f56..e21a215d6 100644 --- a/libensemble/tests/deprecated_tests/test_old_aposmm_one_residual_at_a_time.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_one_residual_at_a_time.py @@ -20,6 +20,7 @@ from libensemble.sim_funcs.chwirut1 import chwirut_eval as sim_f import libensemble.gen_funcs + libensemble.gen_funcs.rc.aposmm_optimizers = 'petsc' from libensemble.gen_funcs.old_aposmm import aposmm_logic as gen_f @@ -32,49 +33,56 @@ # Declare the run parameters/functions m = 214 n = 3 -budget = 50*m +budget = 50 * m -sim_specs = {'sim_f': sim_f, - 'in': ['x', 'obj_component'], - 'out': [('f_i', float)]} +sim_specs = { + 'sim_f': sim_f, + 'in': ['x', 'obj_component'], + 'out': [('f_i', float)], +} gen_out += [('x', float, n), ('x_on_cube', float, n), ('obj_component', int), ('f', float)] # LB tries to avoid x[1]=-x[2], which results in division by zero in chwirut. 
-UB = 2*np.ones(n) -LB = (-2-np.pi/10)*np.ones(n) -gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out] + ['f_i', 'returned'], - 'out': gen_out, - 'user': {'initial_sample_size': 5, - 'lb': LB, - 'ub': UB, - 'localopt_method': 'pounders', - 'dist_to_bound_multiple': 0.5, - 'single_component_at_a_time': True, - 'components': m, - 'combine_component_func': lambda x: np.sum(np.power(x, 2))} - } +UB = 2 * np.ones(n) +LB = (-2 - np.pi / 10) * np.ones(n) +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in gen_out] + ['f_i', 'returned'], + 'out': gen_out, + 'user': { + 'initial_sample_size': 5, + 'lb': LB, + 'ub': UB, + 'localopt_method': 'pounders', + 'dist_to_bound_multiple': 0.5, + 'single_component_at_a_time': True, + 'components': m, + 'combine_component_func': lambda x: np.sum(np.power(x, 2)), + }, +} gen_specs['user'].update({'grtol': 1e-4, 'gatol': 1e-4, 'frtol': 1e-15, 'fatol': 1e-15}) np.random.seed(0) -gen_specs['user']['sample_points'] = np.random.uniform(0, 1, (budget, n))*(UB-LB)+LB -alloc_specs = {'alloc_f': alloc_f, - 'out': [('allocated', bool)], - 'user': {'stop_on_NaNs': True, - 'batch_mode': True, - 'num_active_gens': 1, - 'stop_partial_fvec_eval': True} - } +gen_specs['user']['sample_points'] = np.random.uniform(0, 1, (budget, n)) * (UB - LB) + LB +alloc_specs = { + 'alloc_f': alloc_f, + 'out': [('allocated', bool)], + 'user': { + 'stop_on_NaNs': True, + 'batch_mode': True, + 'num_active_gens': 1, + 'stop_partial_fvec_eval': True, + }, +} persis_info = add_unique_random_streams(persis_info, nworkers + 1) exit_criteria = {'sim_max': budget} # Perform the run -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, - alloc_specs, libE_specs) +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: assert flag == 0 diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_pounders.py b/libensemble/tests/deprecated_tests/test_old_aposmm_pounders.py index 
8e6d7e320..fda1f1d67 100644 --- a/libensemble/tests/deprecated_tests/test_old_aposmm_pounders.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_pounders.py @@ -20,6 +20,7 @@ from libensemble.sim_funcs.chwirut1 import chwirut_eval as sim_f import libensemble.gen_funcs + libensemble.gen_funcs.rc.aposmm_optimizers = 'petsc' from libensemble.gen_funcs.old_aposmm import aposmm_logic as gen_f @@ -33,25 +34,29 @@ n = 3 budget = 10 -sim_specs = {'sim_f': sim_f, - 'in': ['x'], - 'out': [('f', float), ('fvec', float, m)], - 'user': {'combine_component_func': lambda x: np.sum(np.power(x, 2))} - } +sim_specs = { + 'sim_f': sim_f, + 'in': ['x'], + 'out': [('f', float), ('fvec', float, m)], + 'user': {'combine_component_func': lambda x: np.sum(np.power(x, 2))}, +} gen_out += [('x', float, n), ('x_on_cube', float, n)] # lb tries to avoid x[1]=-x[2], which results in division by zero in chwirut. -gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out]+['f', 'fvec', 'returned'], - 'out': gen_out, - 'user': {'initial_sample_size': 5, - 'lb': (-2-np.pi/10)*np.ones(n), - 'ub': 2*np.ones(n), - 'localopt_method': 'pounders', - 'dist_to_bound_multiple': 0.5, - 'components': m} - } +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in gen_out] + ['f', 'fvec', 'returned'], + 'out': gen_out, + 'user': { + 'initial_sample_size': 5, + 'lb': (-2 - np.pi / 10) * np.ones(n), + 'ub': 2 * np.ones(n), + 'localopt_method': 'pounders', + 'dist_to_bound_multiple': 0.5, + 'components': m, + }, +} gen_specs['user'].update({'grtol': 1e-4, 'gatol': 1e-4, 'frtol': 1e-15, 'fatol': 1e-15}) @@ -60,8 +65,7 @@ exit_criteria = {'sim_max': budget} # Perform the run -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, - libE_specs=libE_specs) +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) if is_manager: assert flag == 0 @@ -69,6 +73,7 @@ # Calculating the Jacobian at the best point (though this information was not 
used by pounders) from libensemble.sim_funcs.chwirut1 import EvaluateJacobian + J = EvaluateJacobian(H['x'][np.argmin(H['f'])]) assert np.linalg.norm(J) < 2000 diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_splitcomm.py b/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_splitcomm.py index 51798ea91..32244338c 100644 --- a/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_splitcomm.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_splitcomm.py @@ -21,6 +21,7 @@ from libensemble.sim_funcs.chwirut1 import chwirut_eval as sim_f import libensemble.gen_funcs + libensemble.gen_funcs.rc.aposmm_optimizers = 'petsc' from libensemble.gen_funcs.old_aposmm import aposmm_logic as gen_f @@ -31,32 +32,36 @@ num_comms = 2 # Must have at least num_comms*2 processors nworkers, is_manager, libE_specs, _ = parse_args() libE_specs['mpi_comm'], sub_comm_number = mpi_comm_split(num_comms) -is_manager = (libE_specs['mpi_comm'].Get_rank() == 0) +is_manager = libE_specs['mpi_comm'].Get_rank() == 0 # Declare the run parameters/functions m = 214 n = 3 budget = 10 -sim_specs = {'sim_f': sim_f, - 'in': ['x'], - 'out': [('f', float), ('fvec', float, m)], - 'user': {'combine_component_func': lambda x: np.sum(np.power(x, 2))} - } +sim_specs = { + 'sim_f': sim_f, + 'in': ['x'], + 'out': [('f', float), ('fvec', float, m)], + 'user': {'combine_component_func': lambda x: np.sum(np.power(x, 2))}, +} gen_out += [('x', float, n), ('x_on_cube', float, n)] # lb tries to avoid x[1]=-x[2], which results in division by zero in chwirut. 
-gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out]+['f', 'fvec', 'returned'], - 'out': gen_out, - 'user': {'initial_sample_size': 5, - 'lb': (-2-np.pi/10)*np.ones(n), - 'ub': 2*np.ones(n), - 'localopt_method': 'pounders', - 'dist_to_bound_multiple': 0.5, - 'components': m} - } +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in gen_out] + ['f', 'fvec', 'returned'], + 'out': gen_out, + 'user': { + 'initial_sample_size': 5, + 'lb': (-2 - np.pi / 10) * np.ones(n), + 'ub': 2 * np.ones(n), + 'localopt_method': 'pounders', + 'dist_to_bound_multiple': 0.5, + 'components': m, + }, +} gen_specs['user'].update({'grtol': 1e-4, 'gatol': 1e-4, 'frtol': 1e-15, 'fatol': 1e-15}) @@ -65,8 +70,7 @@ exit_criteria = {'sim_max': budget} # Perform the run -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, - libE_specs=libE_specs) +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) if is_manager: assert flag == 0 @@ -74,6 +78,7 @@ # Calculating the Jacobian at the best point (though this information was not used by pounders) from libensemble.sim_funcs.chwirut1 import EvaluateJacobian + J = EvaluateJacobian(H['x'][np.argmin(H['f'])]) assert np.linalg.norm(J) < 2000 diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_subcomm.py b/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_subcomm.py index fb1e3b312..16f434a99 100644 --- a/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_subcomm.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_pounders_subcomm.py @@ -20,6 +20,7 @@ from libensemble.sim_funcs.chwirut1 import chwirut_eval as sim_f import libensemble.gen_funcs + libensemble.gen_funcs.rc.aposmm_optimizers = 'petsc' from libensemble.gen_funcs.old_aposmm import aposmm_logic as gen_f @@ -34,7 +35,7 @@ is_excluded = True is_manager = False else: - is_manager = (libE_specs['mpi_comm'].Get_rank() == 0) + is_manager = libE_specs['mpi_comm'].Get_rank() 
== 0 is_excluded = False # Declare the run parameters/functions @@ -42,25 +43,29 @@ n = 3 budget = 10 -sim_specs = {'sim_f': sim_f, - 'in': ['x'], - 'out': [('f', float), ('fvec', float, m)], - 'user': {'combine_component_func': lambda x: np.sum(np.power(x, 2))} - } +sim_specs = { + 'sim_f': sim_f, + 'in': ['x'], + 'out': [('f', float), ('fvec', float, m)], + 'user': {'combine_component_func': lambda x: np.sum(np.power(x, 2))}, +} gen_out += [('x', float, n), ('x_on_cube', float, n)] # lb tries to avoid x[1]=-x[2], which results in division by zero in chwirut. -gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out]+['f', 'fvec', 'returned'], - 'out': gen_out, - 'user': {'initial_sample_size': 5, - 'lb': (-2-np.pi/10)*np.ones(n), - 'ub': 2*np.ones(n), - 'localopt_method': 'pounders', - 'dist_to_bound_multiple': 0.5, - 'components': m} - } +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in gen_out] + ['f', 'fvec', 'returned'], + 'out': gen_out, + 'user': { + 'initial_sample_size': 5, + 'lb': (-2 - np.pi / 10) * np.ones(n), + 'ub': 2 * np.ones(n), + 'localopt_method': 'pounders', + 'dist_to_bound_multiple': 0.5, + 'components': m, + }, +} gen_specs['user'].update({'grtol': 1e-4, 'gatol': 1e-4, 'frtol': 1e-15, 'fatol': 1e-15}) @@ -69,8 +74,7 @@ exit_criteria = {'sim_max': budget} # Perform the run -H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, - libE_specs=libE_specs) +H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) if is_manager: assert flag == 0 @@ -78,6 +82,7 @@ # Calculating the Jacobian at the best point (though this information was not used by pounders) from libensemble.sim_funcs.chwirut1 import EvaluateJacobian + J = EvaluateJacobian(H['x'][np.argmin(H['f'])]) assert np.linalg.norm(J) < 2000 diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_sim_dirs.py b/libensemble/tests/deprecated_tests/test_old_aposmm_sim_dirs.py index 9fbaadb14..de1a41179 100644 --- 
a/libensemble/tests/deprecated_tests/test_old_aposmm_sim_dirs.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_sim_dirs.py @@ -22,12 +22,15 @@ from libensemble.sim_funcs.branin.branin_obj import call_branin as sim_f import libensemble.gen_funcs + libensemble.gen_funcs.rc.aposmm_optimizers = ['nlopt', 'scipy'] from libensemble.gen_funcs.old_aposmm import aposmm_logic as gen_f -from libensemble.tests.regression_tests.support import (persis_info_2 as persis_info, - aposmm_gen_out as gen_out, - branin_vals_and_minima as M) +from libensemble.tests.regression_tests.support import ( + persis_info_2 as persis_info, + aposmm_gen_out as gen_out, + branin_vals_and_minima as M, +) from libensemble.tools import parse_args, save_libE_output, add_unique_random_streams nworkers, is_manager, libE_specs, _ = parse_args() @@ -37,36 +40,38 @@ if libE_specs['comms'] == 'tcp': sys.exit("Cannot run with tcp when repeated calls to libE -- aborting...") -sim_specs = {'sim_f': sim_f, - 'in': ['x'], - 'out': [('f', float)]} +sim_specs = {'sim_f': sim_f, 'in': ['x'], 'out': [('f', float)]} if nworkers == 3: sim_specs['user'] = {'uniform_random_pause_ub': 0.001} n = 2 gen_out += [('x', float, n), ('x_on_cube', float, n)] -gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out] + ['f', 'returned'], - 'out': gen_out, - 'user': {'lb': np.array([-5, 0]), - 'ub': np.array([10, 15]), - 'initial_sample_size': 20, - 'localopt_method': 'LN_BOBYQA', - 'dist_to_bound_multiple': 0.99, - 'xtol_rel': 1e-3, - 'min_batch_size': nworkers, - 'high_priority_to_best_localopt_runs': True, - 'max_active_runs': 3} - } +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in gen_out] + ['f', 'returned'], + 'out': gen_out, + 'user': { + 'lb': np.array([-5, 0]), + 'ub': np.array([10, 15]), + 'initial_sample_size': 20, + 'localopt_method': 'LN_BOBYQA', + 'dist_to_bound_multiple': 0.99, + 'xtol_rel': 1e-3, + 'min_batch_size': nworkers, + 'high_priority_to_best_localopt_runs': True, + 'max_active_runs': 3, 
+ }, +} persis_info = add_unique_random_streams(persis_info, nworkers + 1) persis_info_safe = deepcopy(persis_info) # Tell libEnsemble when to stop (stop_val key must be in H) -exit_criteria = {'sim_max': 150, - 'stop_val': ('f', -1)} -# end_exit_criteria_rst_tag +exit_criteria = { + 'sim_max': 150, + 'stop_val': ('f', -1), +} # Perform the run for run in range(2): @@ -78,20 +83,25 @@ exit_criteria['sim_max'] = 500 persis_info = deepcopy(persis_info_safe) - H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, - persis_info, libE_specs=libE_specs) + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, libE_specs=libE_specs) if is_manager: M = M[M[:, -1].argsort()] # Sort by function values (last column) k = M.shape[0] tol = 1e-5 for i in range(k): - dist = np.min(np.sum((H['x'][H['local_min']]-M[i, :2])**2, 1)) + dist = np.min(np.sum((H['x'][H['local_min']] - M[i, :2]) ** 2, 1)) print(dist) assert dist < tol - print("\nAPOSMM + " + gen_specs['user']['localopt_method'] + - " found " + str(k) + " minima to tolerance " + str(tol)) + print( + "\nAPOSMM + " + + gen_specs['user']['localopt_method'] + + " found " + + str(k) + + " minima to tolerance " + + str(tol) + ) save_libE_output(H, persis_info, __file__, nworkers) shutil.rmtree(libE_specs['ensemble_dir_path']) diff --git a/libensemble/tests/deprecated_tests/test_old_aposmm_with_gradients.py b/libensemble/tests/deprecated_tests/test_old_aposmm_with_gradients.py index bc16d0812..42cedde59 100644 --- a/libensemble/tests/deprecated_tests/test_old_aposmm_with_gradients.py +++ b/libensemble/tests/deprecated_tests/test_old_aposmm_with_gradients.py @@ -22,36 +22,39 @@ from libensemble.sim_funcs.six_hump_camel import six_hump_camel as sim_f import libensemble.gen_funcs + libensemble.gen_funcs.rc.aposmm_optimizers = ['nlopt', 'petsc'] from libensemble.gen_funcs.old_aposmm import aposmm_logic as gen_f from libensemble.alloc_funcs.fast_alloc_to_aposmm import give_sim_work_first as alloc_f 
from libensemble.tools import parse_args, save_libE_output, add_unique_random_streams -from libensemble.tests.regression_tests.support import (persis_info_1 as persis_info, - aposmm_gen_out as gen_out, - six_hump_camel_minima as minima) +from libensemble.tests.regression_tests.support import ( + persis_info_1 as persis_info, + aposmm_gen_out as gen_out, + six_hump_camel_minima as minima, +) nworkers, is_manager, libE_specs, _ = parse_args() n = 2 -sim_specs = {'sim_f': sim_f, - 'in': ['x'], - 'out': [('f', float), ('grad', float, n)]} +sim_specs = {'sim_f': sim_f, 'in': ['x'], 'out': [('f', float), ('grad', float, n)]} gen_out += [('x', float, n), ('x_on_cube', float, n)] -gen_specs = {'gen_f': gen_f, - 'in': [o[0] for o in gen_out] + ['f', 'grad', 'returned'], - 'out': gen_out, - 'user': {'initial_sample_size': 100, - 'sample_points': np.round(minima, 1), - 'localopt_method': 'LD_MMA', - 'rk_const': 0.5*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi), - 'xtol_rel': 1e-3, - 'max_active_runs': 6, - 'lb': np.array([-3, -2]), - 'ub': np.array([3, 2]) - } - } +gen_specs = { + 'gen_f': gen_f, + 'in': [o[0] for o in gen_out] + ['f', 'grad', 'returned'], + 'out': gen_out, + 'user': { + 'initial_sample_size': 100, + 'sample_points': np.round(minima, 1), + 'localopt_method': 'LD_MMA', + 'rk_const': 0.5 * ((gamma(1 + (n / 2)) * 5) ** (1 / n)) / sqrt(pi), + 'xtol_rel': 1e-3, + 'max_active_runs': 6, + 'lb': np.array([-3, -2]), + 'ub': np.array([3, 2]), + }, +} alloc_specs = {'alloc_f': alloc_f, 'out': [('allocated', bool)], 'user': {'batch_mode': True, 'num_active_gens': 1}} @@ -94,7 +97,7 @@ def libE_mpi_abort(): # give back a previously evaluated point) gen_specs['user']['ub'] = np.array([-2.9, -1.9]) gen_specs['user']['mu'] = 1e-4 - gen_specs['user']['rk_const'] = 0.01*((gamma(1+(n/2))*5)**(1/n))/sqrt(pi) + gen_specs['user']['rk_const'] = 0.01 * ((gamma(1 + (n / 2)) * 5) ** (1 / n)) / sqrt(pi) gen_specs['user']['lhs_divisions'] = 2 # APOSMM can be called when some run is incomplete 
alloc_specs['user'].pop('batch_mode') @@ -108,8 +111,7 @@ def libE_mpi_abort(): persis_info = deepcopy(persis_info_safe) - H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, - persis_info, alloc_specs, libE_specs) + H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: if flag != 0: @@ -122,10 +124,14 @@ def libE_mpi_abort(): # 1) We use their values to test APOSMM has identified all minima # 2) We use their approximate values to ensure APOSMM evaluates a # point in each minima's basin of attraction. - print(np.min(np.sum((H[H['local_min']]['x'] - m)**2, 1)), flush=True) - if np.min(np.sum((H[H['local_min']]['x'] - m)**2, 1)) > tol: + print(np.min(np.sum((H[H['local_min']]['x'] - m) ** 2, 1)), flush=True) + if np.min(np.sum((H[H['local_min']]['x'] - m) ** 2, 1)) > tol: libE_abort() - print("\nlibEnsemble with APOSMM using a gradient-based localopt method has identified the " + - str(np.shape(minima)[0]) + " minima within a tolerance " + str(tol)) + print( + "\nlibEnsemble with APOSMM using a gradient-based localopt method has identified the " + + str(np.shape(minima)[0]) + + " minima within a tolerance " + + str(tol) + ) save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 0a724bf5f..b07254710 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -8,9 +8,8 @@ BALSAM_SITE = "jln_theta" -THIS_SCRIPT_ON_THETA = any( - [i in socket.gethostname() for i in ["theta", "nid0"]] -) # Is this running on a personal machine, or a compute node? +# Is this running on a personal machine, or a compute node? 
+THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) # Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True From 51c75bcda86e7bb996880e6a799530714aa09db6 Mon Sep 17 00:00:00 2001 From: Jeffrey Larson Date: Thu, 24 Mar 2022 12:17:38 -0500 Subject: [PATCH 72/93] black --- libensemble/tests/scaling_tests/balsam_forces/define_apps.py | 3 +++ .../scaling_tests/balsam_forces/run_libe_forces_balsam.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py index bbc6dfb97..a75142bcf 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py @@ -37,8 +37,10 @@ class LibensembleApp(ApplicationDefinition): } } + print("Defined LibensembleApp Balsam ApplicationDefinition.") + class RemoteForces(ApplicationDefinition): site = "jln_theta" command_template = ( @@ -58,6 +60,7 @@ class RemoteForces(ApplicationDefinition): } } + print("Defined RemoteForces Balsam ApplicationDefinition.") LibensembleApp.sync() diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index b07254710..f95363a1c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -9,7 +9,7 @@ BALSAM_SITE = "jln_theta" # Is this running on a personal machine, or a compute node? 
-THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) +THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) # Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True From e1682c7fa5d847b1dda85ac48ea7987c2234bc25 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 24 Mar 2022 12:22:18 -0500 Subject: [PATCH 73/93] apparently fixes retrieved app's unresolved site_id --- .../scaling_tests/balsam_forces/submit_libe_forces_balsam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py index 14707ef3f..b71b390a8 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py @@ -38,6 +38,7 @@ # Retrieve the libEnsemble app from the Balsam service apps = ApplicationDefinition.load_by_site(BALSAM_SITE) LibensembleApp = apps["LibensembleApp"] +LibensembleApp.resolve_site_id() # Submit the libEnsemble app as a Job to the Balsam service. 
# It will wait for a compatible, running BatchJob session (remote allocation) @@ -48,7 +49,7 @@ transfers={"input_file": input_file}, ) -print("libEnsemble App retrieved and submitted a Job to Balsam service.") +print("libEnsemble App retrieved and submitted as Job to Balsam service.") # Submit an allocation (BatchJob) request to the libEnsemble app's site batch = BatchJob.objects.create( From fba6ba288c1c7ac451967d5f18b7437d45e1af52 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 28 Mar 2022 16:57:28 -0500 Subject: [PATCH 74/93] some reformatting to emphasize how libensemble app reserved for remote runs, some reorganizing of transfers back to ensemble directory --- .../balsam_forces/balsam_forces.yaml | 2 +- .../scaling_tests/balsam_forces/cleanup.sh | 2 +- .../scaling_tests/balsam_forces/define_apps.py | 6 +++--- .../scaling_tests/balsam_forces/forces_simf.py | 7 ++++++- .../balsam_forces/run_libe_forces_balsam.py | 4 ++-- .../balsam_forces/submit_libe_forces_balsam.py | 17 ++++++++++------- 6 files changed, 23 insertions(+), 15 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml index cc1db2d8d..7183c7a7b 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml +++ b/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml @@ -1,6 +1,6 @@ libE_specs: save_every_k_gens: 1000 - profile: False + sim_dirs_make: True exit_criteria: sim_max: 16 diff --git a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh index e3ec82dee..6f5720f96 100755 --- a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh +++ b/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh @@ -1 +1 @@ -rm -r ensemble_* *.npy *.pickle ensemble.log lib*.txt *.stat +rm -r ensemble* *.npy *.pickle ensemble.log lib*.txt *.stat diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py 
b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py index a75142bcf..924065be8 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py @@ -19,7 +19,7 @@ """ -class LibensembleApp(ApplicationDefinition): +class RemoteLibensembleApp(ApplicationDefinition): site = "jln_theta" command_template = ( "/home/jnavarro/.conda/envs/again/bin/python /home/jnavarro" @@ -38,7 +38,7 @@ class LibensembleApp(ApplicationDefinition): } -print("Defined LibensembleApp Balsam ApplicationDefinition.") +print("Defined RemoteLibensembleApp Balsam ApplicationDefinition.") class RemoteForces(ApplicationDefinition): @@ -63,7 +63,7 @@ class RemoteForces(ApplicationDefinition): print("Defined RemoteForces Balsam ApplicationDefinition.") -LibensembleApp.sync() +RemoteLibensembleApp.sync() RemoteForces.sync() print("Synced each app with the Balsam service.") diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index 8bce1e1a8..f1413d5c2 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -57,8 +57,13 @@ def read_last_line(filepath): "kill_rate": kill_rate, } workdir = "worker" + str(libE_info["workerID"]) + "_" + secrets.token_hex(nbytes=3) + forcesfile = "/forces_" + secrets.token_hex(nbytes=3) + ".stat" + + if THIS_SCRIPT_ON_THETA: + file_dest = GLOBUS_DEST_DIR + forcesfile + else: + file_dest = os.getcwd() + forcesfile - file_dest = GLOBUS_DEST_DIR + "/forces_" + secrets.token_hex(nbytes=3) + ".stat" if TRANSFER_STATFILES: transfer = {"result": GLOBUS_ENDPOINT + ":" + file_dest} else: diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index f95363a1c..5910c49b9 100644 --- 
a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -14,7 +14,7 @@ # Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True GLOBUS_ENDPOINT = "jln_laptop" -GLOBUS_DEST_DIR = "/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces" +GLOBUS_DEST_DIR = "/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/ensemble" forces = Ensemble() forces.from_yaml("balsam_forces.yaml") @@ -39,7 +39,7 @@ if not THIS_SCRIPT_ON_THETA: batch = exctr.submit_allocation( - site_id=246, + site_id=246, # Check if matches BALSAM_SITE with `balsam site ls` num_nodes=4, wall_time_min=30, queue="debug-flat-quad", diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py index b71b390a8..b0e612bb4 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py @@ -1,3 +1,4 @@ +import os import time import glob from balsam.api import ApplicationDefinition, BatchJob @@ -7,7 +8,7 @@ that used legacy Balsam commands, except it uses the Balsam API to submit jobs to the scheduler. It can also be run from anywhere and still submit jobs to the same machine. It loads, parameterizes, and submits the LibensembleApp for -execution. +execution. Use this script to run libEnsemble as a Balsam Job on the compute nodes. """ BALSAM_SITE = "jln_theta" @@ -30,20 +31,21 @@ + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" ) -# Transfer forces.stat files back to this script's directory? +# Transfer forces.stat files back to the following local destination? 
# If True, this script cancels remote allocation once SIM_MAX statfiles transferred TRANSFER_STATFILES = True +TRANSFER_DESTINATION = "./ensemble" SIM_MAX = 16 # must match balsam_forces.yaml # Retrieve the libEnsemble app from the Balsam service apps = ApplicationDefinition.load_by_site(BALSAM_SITE) -LibensembleApp = apps["LibensembleApp"] -LibensembleApp.resolve_site_id() +RemoteLibensembleApp = apps["RemoteLibensembleApp"] +RemoteLibensembleApp.resolve_site_id() # Submit the libEnsemble app as a Job to the Balsam service. # It will wait for a compatible, running BatchJob session (remote allocation) -libe_job = LibensembleApp.submit( - workdir="libe_workflow/libe_processes", +libe_job = RemoteLibensembleApp.submit( + workdir="libe_workflow", num_nodes=LIBE_NODES, ranks_per_node=LIBE_RANKS, transfers={"input_file": input_file}, @@ -65,9 +67,10 @@ # Wait for all forces.stat files to be transferred back, then cancel the BatchJob if TRANSFER_STATFILES: + os.makedirs(TRANSFER_DESTINATION, exist_ok=True) print("Waiting for all returned forces.stat files...") - while len(glob.glob("./*.stat")) != SIM_MAX: + while len(glob.glob(os.path.abspath(TRANSFER_DESTINATION) + "/*.stat")) != SIM_MAX: time.sleep(3) print("All forces.stat files returned. 
Cancelling BatchJob session.") From ea3581f6abd5b41a3c157ffb03118d3208c80700 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 28 Mar 2022 16:58:32 -0500 Subject: [PATCH 75/93] black --- .../tests/scaling_tests/balsam_forces/forces_simf.py | 8 ++++++-- .../scaling_tests/balsam_forces/run_libe_forces_balsam.py | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py index f1413d5c2..fa927cc90 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py @@ -93,7 +93,9 @@ def read_last_line(filepath): if THIS_SCRIPT_ON_THETA: statfile = "../" + workdir + "/" + file_dest.split("/")[-1] if read_last_line(statfile) == "kill": - print("Warning: Task completed although marked as a bad run (kill flag set in forces.stat)") + print( + "Warning: Task completed although marked as a bad run (kill flag set in forces.stat)" + ) calc_status = TASK_FAILED else: calc_status = WORKER_DONE @@ -108,7 +110,9 @@ def read_last_line(filepath): else: if TRANSFER_STATFILES: print("Waiting for Task {} statfile.".format(task.name)) - while file_dest not in [os.path.join(os.getcwd(), i) for i in os.listdir(".")]: + while file_dest not in [ + os.path.join(os.getcwd(), i) for i in os.listdir(".") + ]: time.sleep(1) if read_last_line(file_dest) == "kill": diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 5910c49b9..8c4bece35 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -14,7 +14,10 @@ # Use Globus to transfer output forces.stat files back TRANSFER_STATFILES = True GLOBUS_ENDPOINT = "jln_laptop" -GLOBUS_DEST_DIR = "/Users/jnavarro/Desktop/libensemble" + 
"/libensemble/libensemble/tests/scaling_tests/balsam_forces/ensemble" +GLOBUS_DEST_DIR = ( + "/Users/jnavarro/Desktop/libensemble" + + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/ensemble" +) forces = Ensemble() forces.from_yaml("balsam_forces.yaml") From 2be049bf3c43a97f6596fce9418a817ac55f3708 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 29 Mar 2022 16:58:13 -0500 Subject: [PATCH 76/93] next iteration of updating README --- .../scaling_tests/balsam_forces/readme.md | 47 ++++++++++++------- .../balsam_forces/run_libe_forces_balsam.py | 2 +- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 43dc1fb68..23628277f 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -57,13 +57,15 @@ There are several scripts that each need to be adjusted. To explain each: app is an ``ApplicationDefinition`` class with ``site`` and ``command_template`` fields. ``site`` specifies to Balsam on which Balsam site the app should be run, and ``command_template`` specifies the command (as a Jinja2 - string template) that should be executed. This script contains two apps, ``LibensembleApp`` and ``RemoteForces``. + string template) that should be executed. This script contains two apps, ``RemoteLibensembleApp`` + and ``RemoteForces``. If you're running libEnsemble on your personal machine and + only submitting the Forces app via Balsam, only ``RemoteForces`` needs adjusting. Configuring: Adjust the ``site`` field in each ``ApplicationDefinition`` to match your remote Balsam site. Adjust the various paths in the ``command_template`` fields to match - your home directory and/or Python paths. + your home directory and/or Python paths **on the remote machine**. 
**Run this script each time you edit it,** since changes to each ``ApplicationDefinition`` need to be synced with the Balsam service. @@ -72,39 +74,52 @@ There are several scripts that each need to be adjusted. To explain each: About: - This is a typical libEnsemble calling script, but uses the BalsamExecutor - to register + This is a typical libEnsemble plus Executor calling script, but instead of + registering paths to apps as with the MPI Executor, this script loads the + ``RemoteForces`` app synced with the Balsam service in ``define_apps.py`` + and registers it with libEnsemble's Balsam Executor. If running this + script on your personal machine, it also uses the Balsam Executor to check + out resources at a Balsam site. Configuring: + At a minimum (if not transferring statfiles), adjust the ``BALSAM_SITE`` field + to match your remote Balsam site, and fields in the in the + ``batch = exctr.submit_allocation()`` block further down. For ``site_id``, + retrieve the corresponding field with ``balsam site ls``. If this script is being + run on a remote machine, the ``forces.from_yaml()`` path can be adjusted to point to + the ``balsam_forces.yaml`` configuration file on that machine so it doesn't have + to be transferred over. + 3. (optional) ``submit_libe_forces_balsam.py``: About: - This Python script is effectively a batch submission script, capable of checking - out resources + This Python script is effectively a batch submission script. It uses the Balsam API + to check out resources at a Balsam site, and submits libEnsemble as + a Balsam Job onto those resources. Note that customizing the Globus transfer + of the ``balsam_forces.yaml`` file is necessary Configuring: - - - -Configure the `RemoteForces` class in the `define_apps.py` submission script -to match the Balsam site name and the path to the `forces.x` executable -on the remote machine. 
Configure the `submit_allocation()` function in the calling -script to correspond with the site's ID (an integer found via `balsam site ls`), -as well as the correct queue and project for the machine the Balsam site was initialized on. +### Running libEnsemble Then to run with local comms (multiprocessing) with one manager and `N` workers: - python run_libe_forces_funcx.py --comms local --nworkers N + python run_libe_forces_balsam.py --comms local --nworkers N To run with MPI comms using one manager and `N-1` workers: mpirun -np N python run_libe_forces.py -Application parameters can be adjusted in `funcx_forces.yaml`. +**This run libEnsemble itself in-place, with only Forces submitted to a Balsam site.** + +To run both libEnsemble and the Forces app on a Balsam site, use: + + python submit_libe_forces_balsam.py + +Application parameters can be adjusted in `balsam_forces.yaml`. Note that each function and path must be accessible and/or importable on the remote machine. Absolute paths are recommended. 
diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py index 8c4bece35..06e93451c 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py @@ -12,7 +12,7 @@ THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) # Use Globus to transfer output forces.stat files back -TRANSFER_STATFILES = True +TRANSFER_STATFILES = False GLOBUS_ENDPOINT = "jln_laptop" GLOBUS_DEST_DIR = ( "/Users/jnavarro/Desktop/libensemble" From a7d0e8a9707e80dbc694d59c88ce424bd687eee1 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 30 Mar 2022 11:27:04 -0500 Subject: [PATCH 77/93] makes transfers False by default, completes reorganization of readme --- .../balsam_forces/define_apps.py | 2 +- .../scaling_tests/balsam_forces/readme.md | 95 ++++++++----------- .../submit_libe_forces_balsam.py | 14 ++- 3 files changed, 49 insertions(+), 62 deletions(-) diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py index 924065be8..fdda1314d 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py @@ -29,7 +29,7 @@ class RemoteLibensembleApp(ApplicationDefinition): transfers = { "input_file": { - "required": True, + "required": False, "direction": "in", "local_path": ".", "description": "Transfer in of balsam_forces.yaml", diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/balsam_forces/readme.md index 23628277f..79580675e 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/balsam_forces/readme.md @@ -75,7 +75,7 @@ There are several scripts that each need to be adjusted. 
To explain each: About: This is a typical libEnsemble plus Executor calling script, but instead of - registering paths to apps as with the MPI Executor, this script loads the + registering paths to apps like with the MPI Executor, this script loads the ``RemoteForces`` app synced with the Balsam service in ``define_apps.py`` and registers it with libEnsemble's Balsam Executor. If running this script on your personal machine, it also uses the Balsam Executor to check @@ -83,7 +83,7 @@ There are several scripts that each need to be adjusted. To explain each: Configuring: - At a minimum (if not transferring statfiles), adjust the ``BALSAM_SITE`` field + At a minimum (if not performing transfers), adjust the ``BALSAM_SITE`` field to match your remote Balsam site, and fields in the in the ``batch = exctr.submit_allocation()`` block further down. For ``site_id``, retrieve the corresponding field with ``balsam site ls``. If this script is being @@ -96,43 +96,57 @@ There are several scripts that each need to be adjusted. To explain each: About: This Python script is effectively a batch submission script. It uses the Balsam API - to check out resources at a Balsam site, and submits libEnsemble as - a Balsam Job onto those resources. Note that customizing the Globus transfer - of the ``balsam_forces.yaml`` file is necessary + to check out resources (a ``BatchJob``) at a Balsam site, and submits libEnsemble as + a Balsam Job onto those resources. If transferring statfiles back to your + personal machine, it also waits until they are all returned and cancels + the remote ``BatchJob``. Configuring: + Every field in UPPER_CASE can be adjusted. ``BALSAM_SITE``, ``PROJECT``, + and ``QUEUE`` among others will probably need adjusting. ``LIBE_NODES`` and ``LIBE_RANKS`` + specify a subset of resources specifically for libEnsemble out of ``BATCH_NUM_NODES``. 
+ If the ``forces.from_yaml()`` path in the calling script wasn't adjusted, + then ``TRANSFER_CONFIG_FILE`` can be enabled, with ``INPUT_FILE`` set to point + to the ``balsam_forces.yaml`` configuration file on your local machine. -### Running libEnsemble +### Running libEnsemble locally -Then to run with local comms (multiprocessing) with one manager and `N` workers: +First make sure that all Balsam apps are synced with the Balsam service: + + python define_apps.py + +Then run libEnsemble with multiprocessing comms, with one manager and `N` workers: python run_libe_forces_balsam.py --comms local --nworkers N -To run with MPI comms using one manager and `N-1` workers: +Or, run with MPI comms using one manager and `N-1` workers: mpirun -np N python run_libe_forces.py -**This run libEnsemble itself in-place, with only Forces submitted to a Balsam site.** +Many libEnsemble parameters can be adjusted in `balsam_forces.yaml`. -To run both libEnsemble and the Forces app on a Balsam site, use: +To remove output before the next run, use: - python submit_libe_forces_balsam.py + ./cleanup.sh -Application parameters can be adjusted in `balsam_forces.yaml`. +**This runs libEnsemble itself in-place, with only Forces submitted to a Balsam site.** -Note that each function and path must be accessible and/or importable on the -remote machine. Absolute paths are recommended. -**This runs libEnsemble itself in-place, with only forces submitted to a Balsam site.** +### (Optional) Running libEnsemble remotely -To remove output before the next run, use: +The previous instructions for running libEnsemble are understandably insufficient +if running with lots of workers or if the simulation/generation +functions are computationally expensive. 
- ./cleanup.sh +To run both libEnsemble and the Forces app on the compute nodes at Balsam site, use: + + python define_apps.py + python submit_libe_forces_balsam.py ### (Optional) Configuring data-transfer via Balsam and Globus -Although the raw results of forces runs are available in Balsam sites, remote or -local, this is understandably insufficient for the simulation function's capability +Although the raw results of forces runs are available in Balsam sites, +this is understandably insufficient for the simulation function's capability to evaluate results and determine the final status of an app run if it's running on another machine. @@ -154,42 +168,9 @@ to be transferred back to your local launch directory after every app run. The simulation function will wait for Balsam to transfer back a stat file, then determine the calc status based on the received output. -*To transfer files to Theta*, you will need to login to Globus and activate -the ``alcf#dtn_theta`` Managed Public Endpoint. - -### (Optional) Running libEnsemble as a Balsam app on compute nodes - -The previous instructions for running libEnsemble are understandably insufficient -if running with potentially hundreds of workers or if the simulation/generation -functions are computationally expensive. +*To transfer files to/from Theta*, you will need to login to Globus and activate +Theta's Managed Public Endpoint: -The included ``submit_libe_forces_balsam.py`` script will submit libEnsemble itself -as a Balsam Job, to be run by a Balsam site on the compute nodes. From there libEnsemble's -simulation function will behave as before, submitting forces apps to Balsam for scheduling -on the same allocation. - -Since Balsam's API can initiate allocations for a given Balsam site remotely, -``submit_libe_forces_balsam.py`` behaves like a batch submission script except -it can be run from *anywhere* and still initiate a session on Theta. 
This does mean -that any input files still need to be transferred by Globus to be accessible by -libEnsemble running on the compute nodes. Customize the ``input_file`` dictionary -according to Balsam's Globus specifications to do this (see the previous section). - -The following parameters can be adjusted at the top of this script: - - SIM_MAX = 16 # make sure matches in balsam_forces.yaml - BATCH_NUM_NODES = 5 - BATCH_WALL_CLOCK_TIME = 60 - PROJECT = "CSC250STMS07" - QUEUE = "debug-flat-quad" - - # libE Job Parameters - Will use above resources - LIBE_NODES = 1 - LIBE_RANKS = 5 - -**Adjust each of the literal sites, directories, paths and other attributes** -in each of the ``ApplicationDefinition`` instances. If transferring statfiles, -this script can wait for a number of statfiles equal to ``sim_max`` to be returned, -then cancel the remote BatchJob. For this script, set ``TRANSFER_STATFILES`` to ``True.`` -The calling script will also need to be updated to contain the correct Globus endpoint -and destination directory for the transfers. +- Login to Globus, click "Endpoints" on the left. +- Search for ``alcf#dtn_theta``, click on the result. +- On the right, click "Activate", then "Continue". Authenticate with ALCF. diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py index b0e612bb4..72f2972a8 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py @@ -19,21 +19,22 @@ PROJECT = "CSC250STMS07" QUEUE = "debug-flat-quad" -# libEnsemble Job Parameters - Will use above resources +# libEnsemble Job Parameters - A subset of above resources dedicated to libEnsemble LIBE_NODES = 1 LIBE_RANKS = 5 # Parameter file for calling script. Must be transferred to Balsam site. 
# globus_endpoint_key:/path/to/file # globus_endpoint_key specified in BALSAM_SITE's settings.yml -input_file = ( +TRANSFER_CONFIG_FILE = False +INPUT_FILE = ( "jln_laptop:/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" ) # Transfer forces.stat files back to the following local destination? # If True, this script cancels remote allocation once SIM_MAX statfiles transferred -TRANSFER_STATFILES = True +TRANSFER_STATFILES = False TRANSFER_DESTINATION = "./ensemble" SIM_MAX = 16 # must match balsam_forces.yaml @@ -42,13 +43,18 @@ RemoteLibensembleApp = apps["RemoteLibensembleApp"] RemoteLibensembleApp.resolve_site_id() +if TRANSFER_CONFIG_FILE: + transfers = {"input_file": INPUT_FILE} +else: + transfers = {} + # Submit the libEnsemble app as a Job to the Balsam service. # It will wait for a compatible, running BatchJob session (remote allocation) libe_job = RemoteLibensembleApp.submit( workdir="libe_workflow", num_nodes=LIBE_NODES, ranks_per_node=LIBE_RANKS, - transfers={"input_file": input_file}, + transfers=transfers, ) print("libEnsemble App retrieved and submitted as Job to Balsam service.") From 46edbf0bd0b63f2c8cdfd1244e313b94606d04f7 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 18 Apr 2022 11:37:19 -0500 Subject: [PATCH 78/93] fix executor init logic to resolve Balsam2 having new name on pypi --- libensemble/executors/__init__.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 9a332c542..b95e14d7d 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -1,17 +1,18 @@ from libensemble.executors.executor import Executor from libensemble.executors.mpi_executor import MPIExecutor -import os import pkg_resources try: - if pkg_resources.get_distribution('balsam-flow'): - if 'BALSAM_DB_PATH' in os.environ: - from 
libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor - else: - from libensemble.executors.balsam_executor import BalsamExecutor + if pkg_resources.get_distribution("balsam-flow"): # Balsam up through 0.5.0 + from libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor + if pkg_resources.get_distribution("balsam"): # Balsam 0.7.0 onward (Balsam 2) + from libensemble.executors.balsam_executor import BalsamExecutor -except (ModuleNotFoundError, pkg_resources.DistributionNotFound): # One version of Balsam installed, but not the other +except ( + ModuleNotFoundError, + pkg_resources.DistributionNotFound, +): # One version of Balsam installed, but not the other pass -__all__ = ['LegacyBalsamMPIExecutor', 'Executor', 'MPIExecutor', 'BalsamExecutor'] +__all__ = ["LegacyBalsamMPIExecutor", "Executor", "MPIExecutor", "BalsamExecutor"] From 5993a9c747bc2d1c0229ed6c7fb16ebd36fbfb1b Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 18 Apr 2022 13:06:50 -0500 Subject: [PATCH 79/93] primarily a rename to clarify, and some clarifying comments --- libensemble/tests/scaling_tests/balsam_forces/define_apps.py | 2 ++ ..._libe_forces_balsam.py => submit_libe_forces_remotely.py} | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) rename libensemble/tests/scaling_tests/balsam_forces/{submit_libe_forces_balsam.py => submit_libe_forces_remotely.py} (95%) diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py index fdda1314d..0138d534d 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py +++ b/libensemble/tests/scaling_tests/balsam_forces/define_apps.py @@ -16,6 +16,8 @@ Unless changes are made to these Apps, this should only need to be run once to register each of these apps with the Balsam service. 
+ +If not running libEnsemble remotely, feel free to comment-out ``RemoteLibensembleApp.sync()`` """ diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_remotely.py similarity index 95% rename from libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py rename to libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_remotely.py index 72f2972a8..b236cf51a 100644 --- a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_remotely.py @@ -8,7 +8,10 @@ that used legacy Balsam commands, except it uses the Balsam API to submit jobs to the scheduler. It can also be run from anywhere and still submit jobs to the same machine. It loads, parameterizes, and submits the LibensembleApp for -execution. Use this script to run libEnsemble as a Balsam Job on the compute nodes. +execution. Use this script to run libEnsemble as a Balsam Job on compute nodes. + +If running libEnsemble on a laptop, this script is not needed. Just run the +corresponding libEnsemble calling script as normal. """ BALSAM_SITE = "jln_theta" From b1a4881aa1a4276b97901f659c9e479888bbb7aa Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 18 Apr 2022 15:48:12 -0500 Subject: [PATCH 80/93] refactor platforms_index for new Balsam diagram, comparing Balsam versions, links to each. Add new diagram in autodocs for new balsam executor --- docs/platforms/platforms_index.rst | 36 ++++++++++++++++++------ libensemble/executors/balsam_executor.py | 5 ++++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index b21197ab2..1de4d76a1 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -88,22 +88,42 @@ capability (eg. ``aprun``, ``jsrun``). 
MPI applications can only be submitted fr of these systems include: Summit, Sierra and Theta. There are two ways of running libEnsemble on these kind of systems. The first, and simplest, -is to run libEnsemble on the launch nodes. This is often sufficient if the worker's sim or -gen scripts are not doing too much work (other than launching applications). This approach +is to run libEnsemble on the launch nodes. This is often sufficient if the worker's simulation +or generation functions are not doing much work (other than launching applications). This approach is inherently centralized. The entire node allocation is available for the worker-launched tasks. -To run libEnsemble on the compute nodes of these systems requires an alternative Executor, -such as :doc:`Balsam<../executor/balsam_2_executor>`, which maintains a separate service -and launches tasks submitted by workers. Running libEnsemble on the compute -nodes is potentially more scalable and will better manage ``sim_f`` and ``gen_f`` functions -that contain considerable computational work or I/O. +However, running libEnsemble on the compute nodes is potentially more scalable and +will better manage simulation and generation functions that contain considerable +computational work or I/O. Therefore the second option is to use proxy task-execution +services like Balsam_. - .. image:: ../images/centralized_new_detailed_balsam.png +Balsam - Externally managed applications +---------------------------------------- + +Running libEnsemble on the compute nodes while still submitting additional applications +requires alternative Executors that connect to external services like Balsam_. Balsam +can take tasks submitted by workers and execute them on the remaining compute nodes, +or if using Balsam 2, *to entirely different systems*. + + .. figure:: ../images/centralized_new_detailed_balsam.png :alt: central_balsam :scale: 30 :align: center + Single-System: libEnsemble + LegacyBalsamMPIExecutor + + .. 
figure:: ../images/balsam2.png + :alt: balsam2 + :scale: 40 + :align: center + + (New) Multi-System: libEnsemble + BalsamExecutor + +As of v0.8.0+dev, libEnsemble supports both "legacy" Balsam via the +:doc:`LegacyBalsamMPIExecutor<../executor/legacy_balsam_executor>` +and Balsam 2 via the :doc:`BalsamExecutor<../executor/balsam_2_executor>`. + Submission scripts for running on launch/MOM nodes and for using Balsam, can be be found in the :doc:`examples`. diff --git a/libensemble/executors/balsam_executor.py b/libensemble/executors/balsam_executor.py index 903448021..1df98a464 100644 --- a/libensemble/executors/balsam_executor.py +++ b/libensemble/executors/balsam_executor.py @@ -2,6 +2,11 @@ This module launches and controls the running of tasks with Balsam_, and most notably can submit tasks from any machine, to any machine running a Balsam site_. +.. image:: ../images/balsam2.png + :alt: central_balsam + :scale: 40 + :align: center + At this time, access to Balsam is limited to those with valid organizational logins authenticated through Globus_. 
From 5f64174c86e322c0a840102dad6ead526f74ffe1 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 19 Apr 2022 12:17:40 -0500 Subject: [PATCH 81/93] refactor forces tests and many common components into separate directories --- .../forces_with_executor/build_forces.sh | 2 +- .../tutorials/forces_with_executor/cleanup.sh | 1 - .../tutorials/forces_with_executor/forces.c | 2 +- .../forces_simf_simple.py | 1 + .../run_libe_forces_simple.py | 1 + .../balsam_forces/build_forces.sh | 39 ------------------- .../balsam_forces/balsam_forces.yaml | 0 .../{ => forces}/balsam_forces/cleanup.sh | 0 .../{ => forces}/balsam_forces/define_apps.py | 0 .../{ => forces}/balsam_forces/forces_simf.py | 0 .../{ => forces}/balsam_forces/readme.md | 0 .../balsam_forces/run_libe_forces_balsam.py | 0 .../submit_libe_forces_remotely.py | 0 .../scaling_tests/forces/balsam_local.sh | 36 ----------------- .../forces/{ => forces_adv}/cleanup.sh | 0 .../forces/{ => forces_adv}/clone.sh | 0 .../forces/{ => forces_adv}/forces.yaml | 0 .../forces/{ => forces_adv}/forces_simf.py | 0 .../forces/{ => forces_adv}/forces_support.py | 0 .../mini_forces/build_forces.sh | 0 .../mini_forces/mini_forces.c | 0 .../mini_forces/mini_forces_AoS.c | 0 .../forces/{ => forces_adv}/readme.md | 0 .../{ => forces_adv}/run_libe_forces.py | 0 .../run_libe_forces_from_yaml.py | 0 .../{ => forces_adv}/summit_submit_mproc.sh | 0 .../{ => forces_adv}/theta_submit_balsam.sh | 0 .../{ => forces_adv}/theta_submit_mproc.sh | 0 .../forces/{ => forces_app}/build_forces.sh | 0 .../forces/{ => forces_app}/forces.c | 0 .../forces_simple/forces_simf_simple.py | 0 .../forces_simple/run_libe_forces_simple.py | 4 +- .../{ => forces}/funcx_forces/cleanup.sh | 0 .../{ => forces}/funcx_forces/forces_simf.py | 0 .../funcx_forces/funcx_forces.yaml | 0 .../{ => forces}/funcx_forces/readme.md | 0 .../funcx_forces/run_libe_forces_funcx.py | 0 .../funcx_forces/build_forces.sh | 39 ------------------- 38 files changed, 6 insertions(+), 119 deletions(-) 
delete mode 100755 examples/tutorials/forces_with_executor/cleanup.sh create mode 120000 examples/tutorials/forces_with_executor/forces_simf_simple.py create mode 120000 examples/tutorials/forces_with_executor/run_libe_forces_simple.py delete mode 100755 libensemble/tests/scaling_tests/balsam_forces/build_forces.sh rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/balsam_forces.yaml (100%) rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/cleanup.sh (100%) rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/define_apps.py (100%) rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/forces_simf.py (100%) rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/readme.md (100%) rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/run_libe_forces_balsam.py (100%) rename libensemble/tests/scaling_tests/{ => forces}/balsam_forces/submit_libe_forces_remotely.py (100%) delete mode 100755 libensemble/tests/scaling_tests/forces/balsam_local.sh rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/cleanup.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/clone.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/forces.yaml (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/forces_simf.py (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/forces_support.py (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/mini_forces/build_forces.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/mini_forces/mini_forces.c (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/mini_forces/mini_forces_AoS.c (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/readme.md (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/run_libe_forces.py (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/run_libe_forces_from_yaml.py 
(100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/summit_submit_mproc.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/theta_submit_balsam.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_adv}/theta_submit_mproc.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_app}/build_forces.sh (100%) rename libensemble/tests/scaling_tests/forces/{ => forces_app}/forces.c (100%) rename examples/tutorials/forces_with_executor/tutorial_forces_simf.py => libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py (100%) rename examples/tutorials/forces_with_executor/tutorial_run_libe_forces.py => libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces_simple.py (93%) rename libensemble/tests/scaling_tests/{ => forces}/funcx_forces/cleanup.sh (100%) rename libensemble/tests/scaling_tests/{ => forces}/funcx_forces/forces_simf.py (100%) rename libensemble/tests/scaling_tests/{ => forces}/funcx_forces/funcx_forces.yaml (100%) rename libensemble/tests/scaling_tests/{ => forces}/funcx_forces/readme.md (100%) rename libensemble/tests/scaling_tests/{ => forces}/funcx_forces/run_libe_forces_funcx.py (100%) delete mode 100755 libensemble/tests/scaling_tests/funcx_forces/build_forces.sh diff --git a/examples/tutorials/forces_with_executor/build_forces.sh b/examples/tutorials/forces_with_executor/build_forces.sh index 713b765de..b2f0ad557 120000 --- a/examples/tutorials/forces_with_executor/build_forces.sh +++ b/examples/tutorials/forces_with_executor/build_forces.sh @@ -1 +1 @@ -../../../libensemble/tests/scaling_tests/forces/build_forces.sh \ No newline at end of file +../../../libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh \ No newline at end of file diff --git a/examples/tutorials/forces_with_executor/cleanup.sh b/examples/tutorials/forces_with_executor/cleanup.sh deleted file mode 100755 index 6c02df691..000000000 --- 
a/examples/tutorials/forces_with_executor/cleanup.sh +++ /dev/null @@ -1 +0,0 @@ -rm *.stat libE_stats.txt *.err *.out forces.x ensemble.log diff --git a/examples/tutorials/forces_with_executor/forces.c b/examples/tutorials/forces_with_executor/forces.c index 944d7b790..6179f7edf 120000 --- a/examples/tutorials/forces_with_executor/forces.c +++ b/examples/tutorials/forces_with_executor/forces.c @@ -1 +1 @@ -../../../libensemble/tests/scaling_tests/forces/forces.c \ No newline at end of file +../../../libensemble/tests/scaling_tests/forces/forces_app/forces.c \ No newline at end of file diff --git a/examples/tutorials/forces_with_executor/forces_simf_simple.py b/examples/tutorials/forces_with_executor/forces_simf_simple.py new file mode 120000 index 000000000..08e54c77e --- /dev/null +++ b/examples/tutorials/forces_with_executor/forces_simf_simple.py @@ -0,0 +1 @@ +../../../libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py \ No newline at end of file diff --git a/examples/tutorials/forces_with_executor/run_libe_forces_simple.py b/examples/tutorials/forces_with_executor/run_libe_forces_simple.py new file mode 120000 index 000000000..8ce57fdd3 --- /dev/null +++ b/examples/tutorials/forces_with_executor/run_libe_forces_simple.py @@ -0,0 +1 @@ +../../../libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces_simple.py \ No newline at end of file diff --git a/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh b/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh deleted file mode 100755 index 20b106ba4..000000000 --- a/libensemble/tests/scaling_tests/balsam_forces/build_forces.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Building flat MPI - -# GCC -mpicc -O3 -o forces.x ../forces/forces.c -lm - -# Intel -# mpiicc -O3 -o forces.x forces.c - -# Cray -# cc -O3 -o forces.x forces.c - -# ---------------------------------------------- - -# Building with OpenMP for CPU - -# GCC -# mpicc -O3 -fopenmp -o forces.x 
forces.c -lm - -# Intel -# mpiicc -O3 -qopenmp -o forces.x forces.c - -# Cray / Intel (for CCE OpenMP is recognized by default) -# cc -O3 -qopenmp -o forces.x forces.c - -# xl -# xlc_r -O3 -qsmp=omp -o forces.x forces.c - -# ---------------------------------------------- - -# Building with OpenMP for target device (e.g. GPU) -# Need to toggle to OpenMP target directive in forces.c. - -# xl -# xlc_r -O3 -qsmp=omp -qoffload -o forces.x forces.c - -# IRIS node (Intel Gen9 GPU) -# env MPICH_CC=icx mpigcc -g -fiopenmp -fopenmp-targets=spir64 -o forces.x forces.c diff --git a/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/forces/balsam_forces/balsam_forces.yaml similarity index 100% rename from libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml rename to libensemble/tests/scaling_tests/forces/balsam_forces/balsam_forces.yaml diff --git a/libensemble/tests/scaling_tests/balsam_forces/cleanup.sh b/libensemble/tests/scaling_tests/forces/balsam_forces/cleanup.sh similarity index 100% rename from libensemble/tests/scaling_tests/balsam_forces/cleanup.sh rename to libensemble/tests/scaling_tests/forces/balsam_forces/cleanup.sh diff --git a/libensemble/tests/scaling_tests/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py similarity index 100% rename from libensemble/tests/scaling_tests/balsam_forces/define_apps.py rename to libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py diff --git a/libensemble/tests/scaling_tests/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py similarity index 100% rename from libensemble/tests/scaling_tests/balsam_forces/forces_simf.py rename to libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py diff --git a/libensemble/tests/scaling_tests/balsam_forces/readme.md b/libensemble/tests/scaling_tests/forces/balsam_forces/readme.md similarity index 100% rename from 
libensemble/tests/scaling_tests/balsam_forces/readme.md rename to libensemble/tests/scaling_tests/forces/balsam_forces/readme.md diff --git a/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py similarity index 100% rename from libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py rename to libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py diff --git a/libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_remotely.py b/libensemble/tests/scaling_tests/forces/balsam_forces/submit_libe_forces_remotely.py similarity index 100% rename from libensemble/tests/scaling_tests/balsam_forces/submit_libe_forces_remotely.py rename to libensemble/tests/scaling_tests/forces/balsam_forces/submit_libe_forces_remotely.py diff --git a/libensemble/tests/scaling_tests/forces/balsam_local.sh b/libensemble/tests/scaling_tests/forces/balsam_local.sh deleted file mode 100755 index 9e447b743..000000000 --- a/libensemble/tests/scaling_tests/forces/balsam_local.sh +++ /dev/null @@ -1,36 +0,0 @@ -# Script for running with Balsam on a local system. - -# You need to have followed the instructions to install balsam and set-up/activate a database. -# https://github.com/balsam-alcf/balsam - -# The running jobs can be seen inside the setup database dir /data/libe_workflow/ - -# Name of calling script -export EXE=run_libe_forces.py - -# Number of workers. -export NUM_WORKERS=2 - -# Name of working directory where Balsam places running jobs/output (inside the database directory) -export WORKFLOW_NAME=libe_workflow - -export SCRIPT_ARGS=$NUM_WORKERS - -export LIBE_WALLCLOCK=5 # Balsam timeout in mins - -# Add calling script to Balsam database as app and job. 
-export THIS_DIR=$PWD -export SCRIPT_BASENAME=${EXE%.*} - -# Delete any apps/jobs in Balsam -balsam rm apps --all --force -balsam rm jobs --all --force - -# Register your libEnsemble calling script as an app. -balsam app --name $SCRIPT_BASENAME.app --exec $EXE --desc "Run $SCRIPT_BASENAME" - -# Register as a job -balsam job --name job_$SCRIPT_BASENAME --workflow $WORKFLOW_NAME --application $SCRIPT_BASENAME.app --args $SCRIPT_ARGS --wall-time-minutes $LIBE_WALLCLOCK --num-nodes 1 --ranks-per-node $((NUM_WORKERS+1)) --url-out="local:/$THIS_DIR" --stage-out-files="*.out *.txt *.log" --url-in="local:/$THIS_DIR/*" --yes - -#Run job -balsam launcher --consume-all --job-mode=mpi --num-transition-threads=1 diff --git a/libensemble/tests/scaling_tests/forces/cleanup.sh b/libensemble/tests/scaling_tests/forces/forces_adv/cleanup.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/cleanup.sh rename to libensemble/tests/scaling_tests/forces/forces_adv/cleanup.sh diff --git a/libensemble/tests/scaling_tests/forces/clone.sh b/libensemble/tests/scaling_tests/forces/forces_adv/clone.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/clone.sh rename to libensemble/tests/scaling_tests/forces/forces_adv/clone.sh diff --git a/libensemble/tests/scaling_tests/forces/forces.yaml b/libensemble/tests/scaling_tests/forces/forces_adv/forces.yaml similarity index 100% rename from libensemble/tests/scaling_tests/forces/forces.yaml rename to libensemble/tests/scaling_tests/forces/forces_adv/forces.yaml diff --git a/libensemble/tests/scaling_tests/forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/forces_adv/forces_simf.py similarity index 100% rename from libensemble/tests/scaling_tests/forces/forces_simf.py rename to libensemble/tests/scaling_tests/forces/forces_adv/forces_simf.py diff --git a/libensemble/tests/scaling_tests/forces/forces_support.py b/libensemble/tests/scaling_tests/forces/forces_adv/forces_support.py similarity 
index 100% rename from libensemble/tests/scaling_tests/forces/forces_support.py rename to libensemble/tests/scaling_tests/forces/forces_adv/forces_support.py diff --git a/libensemble/tests/scaling_tests/forces/mini_forces/build_forces.sh b/libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/build_forces.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/mini_forces/build_forces.sh rename to libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/build_forces.sh diff --git a/libensemble/tests/scaling_tests/forces/mini_forces/mini_forces.c b/libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces.c similarity index 100% rename from libensemble/tests/scaling_tests/forces/mini_forces/mini_forces.c rename to libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces.c diff --git a/libensemble/tests/scaling_tests/forces/mini_forces/mini_forces_AoS.c b/libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces_AoS.c similarity index 100% rename from libensemble/tests/scaling_tests/forces/mini_forces/mini_forces_AoS.c rename to libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces_AoS.c diff --git a/libensemble/tests/scaling_tests/forces/readme.md b/libensemble/tests/scaling_tests/forces/forces_adv/readme.md similarity index 100% rename from libensemble/tests/scaling_tests/forces/readme.md rename to libensemble/tests/scaling_tests/forces/forces_adv/readme.md diff --git a/libensemble/tests/scaling_tests/forces/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/forces_adv/run_libe_forces.py similarity index 100% rename from libensemble/tests/scaling_tests/forces/run_libe_forces.py rename to libensemble/tests/scaling_tests/forces/forces_adv/run_libe_forces.py diff --git a/libensemble/tests/scaling_tests/forces/run_libe_forces_from_yaml.py b/libensemble/tests/scaling_tests/forces/forces_adv/run_libe_forces_from_yaml.py similarity index 100% rename from 
libensemble/tests/scaling_tests/forces/run_libe_forces_from_yaml.py rename to libensemble/tests/scaling_tests/forces/forces_adv/run_libe_forces_from_yaml.py diff --git a/libensemble/tests/scaling_tests/forces/summit_submit_mproc.sh b/libensemble/tests/scaling_tests/forces/forces_adv/summit_submit_mproc.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/summit_submit_mproc.sh rename to libensemble/tests/scaling_tests/forces/forces_adv/summit_submit_mproc.sh diff --git a/libensemble/tests/scaling_tests/forces/theta_submit_balsam.sh b/libensemble/tests/scaling_tests/forces/forces_adv/theta_submit_balsam.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/theta_submit_balsam.sh rename to libensemble/tests/scaling_tests/forces/forces_adv/theta_submit_balsam.sh diff --git a/libensemble/tests/scaling_tests/forces/theta_submit_mproc.sh b/libensemble/tests/scaling_tests/forces/forces_adv/theta_submit_mproc.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/theta_submit_mproc.sh rename to libensemble/tests/scaling_tests/forces/forces_adv/theta_submit_mproc.sh diff --git a/libensemble/tests/scaling_tests/forces/build_forces.sh b/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/build_forces.sh rename to libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh diff --git a/libensemble/tests/scaling_tests/forces/forces.c b/libensemble/tests/scaling_tests/forces/forces_app/forces.c similarity index 100% rename from libensemble/tests/scaling_tests/forces/forces.c rename to libensemble/tests/scaling_tests/forces/forces_app/forces.c diff --git a/examples/tutorials/forces_with_executor/tutorial_forces_simf.py b/libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py similarity index 100% rename from examples/tutorials/forces_with_executor/tutorial_forces_simf.py rename to 
libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py diff --git a/examples/tutorials/forces_with_executor/tutorial_run_libe_forces.py b/libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces_simple.py similarity index 93% rename from examples/tutorials/forces_with_executor/tutorial_run_libe_forces.py rename to libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces_simple.py index da0b7b00d..d92b69a06 100644 --- a/examples/tutorials/forces_with_executor/tutorial_run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/forces_simple/run_libe_forces_simple.py @@ -16,10 +16,10 @@ # Normally would be pre-compiled if not os.path.isfile("forces.x"): - if os.path.isfile("build_forces.sh"): + if os.path.isfile("../forces_app/build_forces.sh"): import subprocess - subprocess.check_call(["./build_forces.sh"]) + subprocess.check_call(["../forces_app/build_forces.sh"]) # Register simulation executable with executor sim_app = os.path.join(os.getcwd(), "forces.x") diff --git a/libensemble/tests/scaling_tests/funcx_forces/cleanup.sh b/libensemble/tests/scaling_tests/forces/funcx_forces/cleanup.sh similarity index 100% rename from libensemble/tests/scaling_tests/funcx_forces/cleanup.sh rename to libensemble/tests/scaling_tests/forces/funcx_forces/cleanup.sh diff --git a/libensemble/tests/scaling_tests/funcx_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/funcx_forces/forces_simf.py similarity index 100% rename from libensemble/tests/scaling_tests/funcx_forces/forces_simf.py rename to libensemble/tests/scaling_tests/forces/funcx_forces/forces_simf.py diff --git a/libensemble/tests/scaling_tests/funcx_forces/funcx_forces.yaml b/libensemble/tests/scaling_tests/forces/funcx_forces/funcx_forces.yaml similarity index 100% rename from libensemble/tests/scaling_tests/funcx_forces/funcx_forces.yaml rename to libensemble/tests/scaling_tests/forces/funcx_forces/funcx_forces.yaml diff --git 
a/libensemble/tests/scaling_tests/funcx_forces/readme.md b/libensemble/tests/scaling_tests/forces/funcx_forces/readme.md similarity index 100% rename from libensemble/tests/scaling_tests/funcx_forces/readme.md rename to libensemble/tests/scaling_tests/forces/funcx_forces/readme.md diff --git a/libensemble/tests/scaling_tests/funcx_forces/run_libe_forces_funcx.py b/libensemble/tests/scaling_tests/forces/funcx_forces/run_libe_forces_funcx.py similarity index 100% rename from libensemble/tests/scaling_tests/funcx_forces/run_libe_forces_funcx.py rename to libensemble/tests/scaling_tests/forces/funcx_forces/run_libe_forces_funcx.py diff --git a/libensemble/tests/scaling_tests/funcx_forces/build_forces.sh b/libensemble/tests/scaling_tests/funcx_forces/build_forces.sh deleted file mode 100755 index 20b106ba4..000000000 --- a/libensemble/tests/scaling_tests/funcx_forces/build_forces.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Building flat MPI - -# GCC -mpicc -O3 -o forces.x ../forces/forces.c -lm - -# Intel -# mpiicc -O3 -o forces.x forces.c - -# Cray -# cc -O3 -o forces.x forces.c - -# ---------------------------------------------- - -# Building with OpenMP for CPU - -# GCC -# mpicc -O3 -fopenmp -o forces.x forces.c -lm - -# Intel -# mpiicc -O3 -qopenmp -o forces.x forces.c - -# Cray / Intel (for CCE OpenMP is recognized by default) -# cc -O3 -qopenmp -o forces.x forces.c - -# xl -# xlc_r -O3 -qsmp=omp -o forces.x forces.c - -# ---------------------------------------------- - -# Building with OpenMP for target device (e.g. GPU) -# Need to toggle to OpenMP target directive in forces.c. 
- -# xl -# xlc_r -O3 -qsmp=omp -qoffload -o forces.x forces.c - -# IRIS node (Intel Gen9 GPU) -# env MPICH_CC=icx mpigcc -g -fiopenmp -fopenmp-targets=spir64 -o forces.x forces.c From 44f64670a98739ea067f4bbb0246da6682f7c007 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 19 Apr 2022 14:39:39 -0500 Subject: [PATCH 82/93] move miniforces, refactor balsam forces test to be simplified based on recent tutorial refactoring, remotes balsam_forces.yaml --- .../forces/balsam_forces/balsam_forces.yaml | 38 ----- .../forces/balsam_forces/define_apps.py | 2 +- .../forces/balsam_forces/forces_simf.py | 133 +++++------------- .../balsam_forces/run_libe_forces_balsam.py | 47 +++++-- .../mini_forces/build_forces.sh | 0 .../mini_forces/mini_forces.c | 0 .../mini_forces/mini_forces_AoS.c | 0 7 files changed, 68 insertions(+), 152 deletions(-) delete mode 100644 libensemble/tests/scaling_tests/forces/balsam_forces/balsam_forces.yaml rename libensemble/tests/scaling_tests/forces/{forces_adv => forces_app}/mini_forces/build_forces.sh (100%) rename libensemble/tests/scaling_tests/forces/{forces_adv => forces_app}/mini_forces/mini_forces.c (100%) rename libensemble/tests/scaling_tests/forces/{forces_adv => forces_app}/mini_forces/mini_forces_AoS.c (100%) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/balsam_forces.yaml b/libensemble/tests/scaling_tests/forces/balsam_forces/balsam_forces.yaml deleted file mode 100644 index 7183c7a7b..000000000 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/balsam_forces.yaml +++ /dev/null @@ -1,38 +0,0 @@ -libE_specs: - save_every_k_gens: 1000 - sim_dirs_make: True - exit_criteria: - sim_max: 16 - -sim_specs: - function: libensemble.tests.scaling_tests.balsam_forces.forces_simf.run_forces_balsam - inputs: - - x - outputs: - energy: - type: float - user: - keys: - - seed - sim_particles: 1.e+3 - sim_timesteps: 5 - particle_variance: 0.2 - kill_rate: 0.5 - -gen_specs: - function: 
libensemble.gen_funcs.sampling.uniform_random_sample - outputs: - x: - type: float - size: 1 - user: - gen_batch_size: 1000 - -alloc_specs: - function: libensemble.alloc_funcs.give_sim_work_first.give_sim_work_first - outputs: - allocated: - type: bool - user: - batch_mode: True - num_active_gens: 1 diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py index 0138d534d..50d26465c 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py @@ -48,7 +48,7 @@ class RemoteForces(ApplicationDefinition): command_template = ( "/home/jnavarro" + "/libensemble/libensemble/tests/scaling_tests/forces/forces.x" - + " {{sim_particles}} {{sim_timesteps}} {{seed}} {{kill_rate}}" + + " {{sim_particles}} {{sim_timesteps}} {{seed}}" + " > out.txt 2>&1" ) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index fa927cc90..c9a7662b8 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -8,66 +8,36 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): from libensemble.executors.executor import Executor from libensemble.message_numbers import WORKER_DONE, TASK_FAILED - def perturb(particles, seed, max_fraction): - MAX_SEED = 32767 - """Modify particle count""" - seed_fraction = seed / MAX_SEED - max_delta = particles * max_fraction - delta = seed_fraction * max_delta - delta = delta - max_delta / 2 # translate so -/+ - new_particles = particles + delta - return int(new_particles) - - def read_last_line(filepath): - """Read last line of statfile""" - try: - with open(filepath, "rb") as fh: - line = fh.readlines()[-1].decode().rstrip() - except Exception: - line = "" # In case file is empty 
or not yet created - return line - - calc_status = 0 # Returns to worker + calc_status = 0 + + particles = str(int(H["x"][0][0])) exctr = Executor.executor - x = H["x"] - sim_particles = sim_specs["user"]["sim_particles"] - sim_timesteps = sim_specs["user"]["sim_timesteps"] - TRANSFER_STATFILES = sim_specs["user"]["transfer"] GLOBUS_ENDPOINT = sim_specs["user"]["globus_endpoint"] GLOBUS_DEST_DIR = sim_specs["user"]["globus_dest_dir"] THIS_SCRIPT_ON_THETA = sim_specs["user"]["this_script_on_theta"] - # Get from dictionary if key exists, else return default (e.g. 0) - kill_rate = sim_specs["user"].get("kill_rate", 0) - particle_variance = sim_specs["user"].get("particle_variance", 0) - - # Composing variable names and x values to set up simulation - seed = int(np.rint(x[0][0])) - - # This is to give a random variance of work-load - sim_particles = perturb(sim_particles, seed, particle_variance) - print("seed: {} particles: {}".format(seed, sim_particles)) - args = { - "sim_particles": sim_particles, - "sim_timesteps": sim_timesteps, - "seed": seed, - "kill_rate": kill_rate, + "sim_particles": particles, + "sim_timesteps": str(10), + "seed": particles, } - workdir = "worker" + str(libE_info["workerID"]) + "_" + secrets.token_hex(nbytes=3) - forcesfile = "/forces_" + secrets.token_hex(nbytes=3) + ".stat" + + workdir = "sim" + ["libE_info"]["H_rows"] + "_worker" + str(libE_info["workerID"]) + + statfile = "forces{}.stat".format(particles) if THIS_SCRIPT_ON_THETA: - file_dest = GLOBUS_DEST_DIR + forcesfile + transfer_statfile_path = GLOBUS_DEST_DIR + statfile + local_statfile_path = ( + "../" + workdir + "/" + transfer_statfile_path.split("/")[-1] + ) else: - file_dest = os.getcwd() + forcesfile + transfer_statfile_path = os.getcwd() + statfile + local_statfile_path = transfer_statfile_path - if TRANSFER_STATFILES: - transfer = {"result": GLOBUS_ENDPOINT + ":" + file_dest} - else: - transfer = {} + transfer = {"result": GLOBUS_ENDPOINT + ":" + transfer_statfile_path} 
task = exctr.submit( app_name="forces", @@ -80,60 +50,21 @@ def read_last_line(filepath): workdir=workdir, ) - poll_interval = 2 # secs - print("Beginning to poll Task {}".format(task.name)) - while not task.finished: - time.sleep(poll_interval) - task.poll() - if task.state == "FAILED": - break - - if task.state in ["FINISHED", "FAILED"]: - print("Task {} exited with state {}.".format(task.name, task.state)) - if THIS_SCRIPT_ON_THETA: - statfile = "../" + workdir + "/" + file_dest.split("/")[-1] - if read_last_line(statfile) == "kill": - print( - "Warning: Task completed although marked as a bad run (kill flag set in forces.stat)" - ) - calc_status = TASK_FAILED - else: - calc_status = WORKER_DONE - print("Task completed successfully.") - - try: - data = np.loadtxt(statfile) - final_energy = data[-1] - except Exception: - final_energy = np.nan - - else: - if TRANSFER_STATFILES: - print("Waiting for Task {} statfile.".format(task.name)) - while file_dest not in [ - os.path.join(os.getcwd(), i) for i in os.listdir(".") - ]: - time.sleep(1) - - if read_last_line(file_dest) == "kill": - print( - "Warning: Task completed although marked as a bad run (kill flag set in retrieved forces.stat)" - ) - calc_status = TASK_FAILED - else: - calc_status = WORKER_DONE - print("Task completed successfully. forces.stat retrieved.") - - try: - data = np.loadtxt(file_dest) - final_energy = data[-1] - except Exception: - final_energy = np.nan - else: - calc_status = WORKER_DONE - print("Task completed.") - else: - print(task.state) + task.wait(timeout=300) + task.poll() + + print("Task {} polled. 
state: {}.".format(task.name, task.state)) + + while not os.path.lexists(local_statfile_path): + time.sleep(1) + + try: + data = np.loadtxt(local_statfile_path) + final_energy = data[-1] + calc_status = WORKER_DONE + except Exception: + final_energy = np.nan + calc_status = TASK_FAILED outspecs = sim_specs["out"] output = np.zeros(1, dtype=outspecs) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py index 06e93451c..87471172b 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py @@ -2,7 +2,9 @@ import socket import numpy as np -from libensemble import Ensemble +from libensemble.libE import libE +from libensemble.gen_funcs.sampling import uniform_random_sample +from forces_simf import run_forces_balsam from libensemble.executors import BalsamExecutor from balsam.api import ApplicationDefinition @@ -12,27 +14,45 @@ THIS_SCRIPT_ON_THETA = any([i in socket.gethostname() for i in ["theta", "nid0"]]) # Use Globus to transfer output forces.stat files back -TRANSFER_STATFILES = False GLOBUS_ENDPOINT = "jln_laptop" GLOBUS_DEST_DIR = ( "/Users/jnavarro/Desktop/libensemble" + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/ensemble" ) -forces = Ensemble() -forces.from_yaml("balsam_forces.yaml") +# Parse number of workers, comms type, etc. 
from arguments +nworkers, is_manager, libE_specs, _ = parse_args() -forces.gen_specs["user"].update({"lb": np.array([0]), "ub": np.array([32767])}) -forces.sim_specs["user"].update( - { - "transfer": TRANSFER_STATFILES, +# State the sim_f, inputs, outputs +sim_specs = { + "sim_f": run_forces_balsam, # sim_f, imported above + "in": ["x"], # Name of input for sim_f + "out": [("energy", float)], # Name, type of output from sim_f + "user": { "globus_endpoint": GLOBUS_ENDPOINT, "globus_dest_dir": GLOBUS_DEST_DIR, "this_script_on_theta": THIS_SCRIPT_ON_THETA, - } -) + }, +} + +# State the gen_f, inputs, outputs, additional parameters +gen_specs = { + "gen_f": uniform_random_sample, # Generator function + "out": [("x", float, (1,))], # Name, type and size of data from gen_f + "user": { + "lb": np.array([1000]), # User parameters for the gen_f + "ub": np.array([3000]), + "gen_batch_size": 8, + }, +} + +# Create and work inside separate per-simulation directories +libE_specs["sim_dirs_make"] = True -forces.persis_info.add_random_streams() +# Instruct libEnsemble to exit after this many simulations +exit_criteria = {"sim_max": 8} + +persis_info = add_unique_random_streams({}, nworkers + 1) apps = ApplicationDefinition.load_by_site(BALSAM_SITE) RemoteForces = apps["RemoteForces"] @@ -49,7 +69,10 @@ project="CSC250STMS07", ) -forces.run() +# Launch libEnsemble +H, persis_info, flag = libE( + sim_specs, gen_specs, exit_criteria, persis_info=persis_info, libE_specs=libE_specs +) if not THIS_SCRIPT_ON_THETA: exctr.revoke_allocation(batch) diff --git a/libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/build_forces.sh b/libensemble/tests/scaling_tests/forces/forces_app/mini_forces/build_forces.sh similarity index 100% rename from libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/build_forces.sh rename to libensemble/tests/scaling_tests/forces/forces_app/mini_forces/build_forces.sh diff --git 
a/libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces.c b/libensemble/tests/scaling_tests/forces/forces_app/mini_forces/mini_forces.c similarity index 100% rename from libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces.c rename to libensemble/tests/scaling_tests/forces/forces_app/mini_forces/mini_forces.c diff --git a/libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces_AoS.c b/libensemble/tests/scaling_tests/forces/forces_app/mini_forces/mini_forces_AoS.c similarity index 100% rename from libensemble/tests/scaling_tests/forces/forces_adv/mini_forces/mini_forces_AoS.c rename to libensemble/tests/scaling_tests/forces/forces_app/mini_forces/mini_forces_AoS.c From a96238d88daf98ce2bacf18b1c44280256036e30 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 19 Apr 2022 17:49:55 -0500 Subject: [PATCH 83/93] flake8 --- examples/calling_scripts/run_libe_forces.py | 2 +- examples/calling_scripts/run_libe_forces_from_yaml.py | 2 +- .../tests/scaling_tests/forces/balsam_forces/forces_simf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/calling_scripts/run_libe_forces.py b/examples/calling_scripts/run_libe_forces.py index ec19d93cb..d60d20cba 120000 --- a/examples/calling_scripts/run_libe_forces.py +++ b/examples/calling_scripts/run_libe_forces.py @@ -1 +1 @@ -../../libensemble/tests/scaling_tests/forces/run_libe_forces.py \ No newline at end of file +../../libensemble/tests/scaling_tests/forces/forces_adv/run_libe_forces.py \ No newline at end of file diff --git a/examples/calling_scripts/run_libe_forces_from_yaml.py b/examples/calling_scripts/run_libe_forces_from_yaml.py index 3f4d7b4b0..bc210f22d 120000 --- a/examples/calling_scripts/run_libe_forces_from_yaml.py +++ b/examples/calling_scripts/run_libe_forces_from_yaml.py @@ -1 +1 @@ -../../libensemble/tests/scaling_tests/forces/run_libe_forces_from_yaml.py \ No newline at end of file 
+../../libensemble/tests/scaling_tests/forces/forces_adv/run_libe_forces_from_yaml.py \ No newline at end of file diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index c9a7662b8..da5687255 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -2,11 +2,11 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): import os import time - import secrets import numpy as np from libensemble.executors.executor import Executor from libensemble.message_numbers import WORKER_DONE, TASK_FAILED + from libensemble.tools import parse_args, add_unique_random_streams calc_status = 0 From b0f408de418fa0f385d7fdd864ba40a6680f626d Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 19 Apr 2022 17:51:22 -0500 Subject: [PATCH 84/93] flake8 actually --- .../tests/scaling_tests/forces/balsam_forces/forces_simf.py | 1 - .../forces/balsam_forces/run_libe_forces_balsam.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index da5687255..6c036717e 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -6,7 +6,6 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): from libensemble.executors.executor import Executor from libensemble.message_numbers import WORKER_DONE, TASK_FAILED - from libensemble.tools import parse_args, add_unique_random_streams calc_status = 0 diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py index 87471172b..cf523d083 100644 --- 
a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py @@ -6,6 +6,8 @@ from libensemble.gen_funcs.sampling import uniform_random_sample from forces_simf import run_forces_balsam from libensemble.executors import BalsamExecutor +from libensemble.tools import parse_args, add_unique_random_streams + from balsam.api import ApplicationDefinition BALSAM_SITE = "jln_theta" From 0f4fdb0227e1d885552afc787a20aace64b945d9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 20 Apr 2022 10:45:04 -0500 Subject: [PATCH 85/93] fix some paths, remove unneeded transfer, fix workdir naming --- .../forces/balsam_forces/define_apps.py | 16 +++------------- .../forces/balsam_forces/forces_simf.py | 2 +- .../balsam_forces/run_libe_forces_balsam.py | 2 +- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py b/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py index 50d26465c..c232fdb06 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/define_apps.py @@ -25,20 +25,10 @@ class RemoteLibensembleApp(ApplicationDefinition): site = "jln_theta" command_template = ( "/home/jnavarro/.conda/envs/again/bin/python /home/jnavarro" - + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/run_libe_forces_balsam.py" + + "/libensemble/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py" + " > libe_out.txt 2>&1" ) - transfers = { - "input_file": { - "required": False, - "direction": "in", - "local_path": ".", - "description": "Transfer in of balsam_forces.yaml", - "recursive": False, - } - } - print("Defined RemoteLibensembleApp Balsam ApplicationDefinition.") @@ -47,14 +37,14 @@ class RemoteForces(ApplicationDefinition): site = "jln_theta" command_template = ( "/home/jnavarro" - + 
"/libensemble/libensemble/tests/scaling_tests/forces/forces.x" + + "/libensemble/libensemble/tests/scaling_tests/forces/forces_app/forces.x" + " {{sim_particles}} {{sim_timesteps}} {{seed}}" + " > out.txt 2>&1" ) transfers = { "result": { - "required": False, + "required": True, "direction": "out", "local_path": "forces.stat", "description": "Forces stat file", diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index 6c036717e..bbf75f8a0 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -23,7 +23,7 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): "seed": particles, } - workdir = "sim" + ["libE_info"]["H_rows"] + "_worker" + str(libE_info["workerID"]) + workdir = "sim" + libE_info["H_rows"] + "_worker" + str(libE_info["workerID"]) statfile = "forces{}.stat".format(particles) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py index cf523d083..648cba37b 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py @@ -19,7 +19,7 @@ GLOBUS_ENDPOINT = "jln_laptop" GLOBUS_DEST_DIR = ( "/Users/jnavarro/Desktop/libensemble" - + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/ensemble" + + "/libensemble/libensemble/tests/scaling_tests/forces/balsam_forces/ensemble" ) # Parse number of workers, comms type, etc. 
from arguments From c1dbcbfcc6fb5d013acbc7708146a63439bf35da Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 20 Apr 2022 11:25:56 -0500 Subject: [PATCH 86/93] moving imports to top of file --- .../forces/balsam_forces/forces_simf.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index bbf75f8a0..530361624 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -1,11 +1,12 @@ -def run_forces_balsam(H, persis_info, sim_specs, libE_info): +import os +import time +import numpy as np + +from libensemble.executors.executor import Executor +from libensemble.message_numbers import WORKER_DONE, TASK_FAILED - import os - import time - import numpy as np - from libensemble.executors.executor import Executor - from libensemble.message_numbers import WORKER_DONE, TASK_FAILED +def run_forces_balsam(H, persis_info, sim_specs, libE_info): calc_status = 0 From e450e6088d9c938d284cc379974bbbaf2b46574b Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 20 Apr 2022 13:37:29 -0500 Subject: [PATCH 87/93] fix workdir concatanation again, improve executor __init__.py logic to only try to import legacy balsam executor if unsuccessful on importing new executor --- libensemble/executors/__init__.py | 14 +++++++------- .../forces/balsam_forces/forces_simf.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index b95e14d7d..058145f37 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -4,15 +4,15 @@ import pkg_resources try: - if pkg_resources.get_distribution("balsam-flow"): # Balsam up through 0.5.0 - from libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor if 
pkg_resources.get_distribution("balsam"): # Balsam 0.7.0 onward (Balsam 2) from libensemble.executors.balsam_executor import BalsamExecutor -except ( - ModuleNotFoundError, - pkg_resources.DistributionNotFound, -): # One version of Balsam installed, but not the other - pass +except (ModuleNotFoundError, pkg_resources.DistributionNotFound): + try: + if pkg_resources.get_distribution("balsam-flow"): # Balsam up through 0.5.0 + from libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor + except (ModuleNotFoundError, pkg_resources.DistributionNotFound): + pass + __all__ = ["LegacyBalsamMPIExecutor", "Executor", "MPIExecutor", "BalsamExecutor"] diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index 530361624..230fcd6f8 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -24,7 +24,7 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): "seed": particles, } - workdir = "sim" + libE_info["H_rows"] + "_worker" + str(libE_info["workerID"]) + workdir = "sim" + str(libE_info["H_rows"]) + "_worker" + str(libE_info["workerID"]) statfile = "forces{}.stat".format(particles) From 8b1f00f76166a52276941bb619bba4946dc6aad9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 20 Apr 2022 14:53:03 -0500 Subject: [PATCH 88/93] rearrange README so globus instructions are higher, remove references to transferring balsam_forces.yaml, assumes transferring statfiles throughout --- .../forces/balsam_forces/readme.md | 99 ++++++++++--------- .../balsam_forces/run_libe_forces_balsam.py | 11 ++- .../submit_libe_forces_remotely.py | 37 ++----- 3 files changed, 66 insertions(+), 81 deletions(-) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/readme.md b/libensemble/tests/scaling_tests/forces/balsam_forces/readme.md index 79580675e..364ccf6dc 100644 
--- a/libensemble/tests/scaling_tests/forces/balsam_forces/readme.md +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/readme.md @@ -6,7 +6,8 @@ This is a synthetic, highly configurable simulation function. Its primary use is to test libEnsemble's capability to submit application instances via the Balsam service, including to separate machines from libEnsemble's processes. This means that although this is typically a HPC scaling test, this can be run on a laptop with the `forces.x` -simulation submitted to the remote machine. +simulation submitted to the remote machine, and the resulting data-files transferred +back to the machine that runs the libEnsemble calling script. Note that this test currently requires active ALCF credentials to authenticate with the Balsam service. @@ -26,15 +27,16 @@ To run forces as a standalone executable on `N` procs: mpirun -np N ./forces.x -This application will need to be compiled on the remote machine where the sim_f will run. -See below. +**This application will need to be compiled on the remote machine:** + + cd libensemble/libensemble/tests/scaling_tests/forces/forces_app + ./build_forces.sh ### Configuring Balsam On the remote machine (in a conda or other virtual environment): - git clone https://github.com/argonne-lcf/balsam.git - cd balsam; pip install -e .; cd ..; + pip install balsam balsam login balsam site init ./my-site cd my-site; balsam site start @@ -45,6 +47,38 @@ your ALCF credentials. On any machine you've installed and logged into Balsam, you can run `balsam site ls` to list your sites and `balsam job rm --all` to remove extraneous jobs between runs. +### Configuring data-transfer via Balsam and Globus + +Although the raw results of forces runs are available in Balsam sites, +this is understandably insufficient for the simulation function's capability +to evaluate results and determine the final status of an app run if it's running +on another machine. 
+ +Balsam can coordinate data transfers via Globus between Globus endpoints. Assuming +this test is being run on a personal device, do the following to configure Globus, +then Balsam to use Globus. + +- Login to [Globus](https://www.globus.org/) using ALCF or other approved organization credentials. +- Download and run [Globus Connect Personal](https://app.globus.org/file-manager/gcp) to register your device as a Globus endpoint. Note the initialized collection name, e.g. ``test_collection``. +- Once a Globus collection has been initialized in Globus Connect Personal, login to Globus, click "Endpoints" on the left. +- Click the collection that was created on your personal device. Copy the string after "Endpoint UUID". +- Login to the remote machine, switch to your Balsam site directory, run ``balsam site globus-login``. +- Modify ``settings.yml`` to contain a new transfer_location that matches your device, with the copied endpoint UUID. e.g. ``test_collection: globus://19036a15-570a-12f8-bef8-22060b9b458d`` +- Run ``balsam site sync`` within the site directory to save these changes. +- Locally, in the calling script (``run_libe_forces_balsam.py``), set ``GLOBUS_ENDPOINT`` to the collection name for the previously-defined transfer_location. + +This should be sufficient for ``forces.stat`` files from remote Balsam app runs +to be transferred back to your personal device after every app run. The +simulation function will wait for Balsam to transfer back a stat file, then determine +the calc status based on the received output. + +*To transfer files to/from Theta*, you will need to login to Globus and activate +Theta's Managed Public Endpoint: + +- Login to Globus, click "Endpoints" on the left. +- Search for ``alcf#dtn_theta``, click on the result. +- On the right, click "Activate", then "Continue". Authenticate with ALCF. + ### Configuring libEnsemble There are several scripts that each need to be adjusted. 
To explain each: @@ -65,7 +99,8 @@ There are several scripts that each need to be adjusted. To explain each: Adjust the ``site`` field in each ``ApplicationDefinition`` to match your remote Balsam site. Adjust the various paths in the ``command_template`` fields to match - your home directory and/or Python paths **on the remote machine**. + your home directory and/or Python paths **on the remote machine**. If running + libEnsemble on your personal machine, feel free to comment out ``RemoteLibensembleApp.sync()``. **Run this script each time you edit it,** since changes to each ``ApplicationDefinition`` need to be synced with the Balsam service. @@ -78,18 +113,17 @@ There are several scripts that each need to be adjusted. To explain each: registering paths to apps like with the MPI Executor, this script loads the ``RemoteForces`` app synced with the Balsam service in ``define_apps.py`` and registers it with libEnsemble's Balsam Executor. If running this - script on your personal machine, it also uses the Balsam Executor to check + script on your personal machine, it also uses the Balsam Executor to reserve out resources at a Balsam site. Configuring: - At a minimum (if not performing transfers), adjust the ``BALSAM_SITE`` field + See the Globus instructions above for setting up Globus transfers within this script. + + Adjust the ``BALSAM_SITE`` field to match your remote Balsam site, and fields in the in the ``batch = exctr.submit_allocation()`` block further down. For ``site_id``, - retrieve the corresponding field with ``balsam site ls``. If this script is being - run on a remote machine, the ``forces.from_yaml()`` path can be adjusted to point to - the ``balsam_forces.yaml`` configuration file on that machine so it doesn't have - to be transferred over. + retrieve the corresponding field with ``balsam site ls``. 3. (optional) ``submit_libe_forces_balsam.py``: @@ -99,16 +133,13 @@ There are several scripts that each need to be adjusted.
To explain each: to check out resources (a ``BatchJob``) at a Balsam site, and submits libEnsemble as a Balsam Job onto those resources. If transferring statfiles back to your personal machine, it also waits until they are all returned and cancels - the remote ``BatchJob``. + the remote ``BatchJob``. *Probably only needed if running libEnsemble remotely.* Configuring: Every field in UPPER_CASE can be adjusted. ``BALSAM_SITE``, ``PROJECT``, and ``QUEUE`` among others will probably need adjusting. ``LIBE_NODES`` and ``LIBE_RANKS`` specify a subset of resources specifically for libEnsemble out of ``BATCH_NUM_NODES``. - If the ``forces.from_yaml()`` path in the calling script wasn't adjusted, - then ``TRANSFER_CONFIG_FILE`` can be enabled, with ``INPUT_FILE`` set to point - to the ``balsam_forces.yaml`` configuration file on your local machine. ### Running libEnsemble locally @@ -122,9 +153,8 @@ Then run libEnsemble with multiprocessing comms, with one manager and `N` worker Or, run with MPI comms using one manager and `N-1` workers: - mpirun -np N python run_libe_forces.py + mpirun -np N python run_libe_forces_balsam.py -Many libEnsemble parameters can be adjusted in `balsam_forces.yaml`. To remove output before the next run, use: @@ -143,34 +173,5 @@ To run both libEnsemble and the Forces app on the compute nodes at Balsam site, python define_apps.py python submit_libe_forces_balsam.py -### (Optional) Configuring data-transfer via Balsam and Globus - -Although the raw results of forces runs are available in Balsam sites, -this is understandably insufficient for the simulation function's capability -to evaluate results and determine the final status of an app run if it's running -on another machine. - -Balsam can coordinate data transfers via Globus between Globus endpoints. Assuming -this test is being run on a personal device, do the following to configure Globus, -then Balsam to use Globus. 
- -- Login to [Globus](https://www.globus.org/) using ALCF or other approved organization credentials. -- Download and run [Globus Connect Personal](https://app.globus.org/file-manager/gcp) to register your device as a Globus endpoint. -- Once a Globus collection has been initialized in Globus Connect Personal, login to Globus, click "Endpoints" on the left. -- Click the collection that was created on your personal device. Copy the string after "Endpoint UUID". -- Login to the remote machine, switch to your Balsam site directory, run ``balsam site globus-login``. -- Modify ``settings.yml`` to contain a new transfer_location that matches your device, with the copied endpoint UUID. -- Run ``balsam site sync`` within the site directory to save these changes. -- Locally, in the calling script, enable ``TRANSFER_STATFILES`` and set ``GLOBUS_ENDPOINT`` to the key for the previously-defined transfer_location - -This should be sufficient for ``forces.stat`` files from remote Balsam app runs -to be transferred back to your local launch directory after every app run. The -simulation function will wait for Balsam to transfer back a stat file, then determine -the calc status based on the received output. - -*To transfer files to/from Theta*, you will need to login to Globus and activate -Theta's Managed Public Endpoint: - -- Login to Globus, click "Endpoints" on the left. -- Search for ``alcf#dtn_theta``, click on the result. -- On the right, click "Activate", then "Continue". Authenticate with ALCF. +This routine will wait for corresponding statfiles to be transferred back from +the remote machine, then cancel the allocation. 
diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py index 648cba37b..cc62d4008 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py @@ -17,10 +17,11 @@ # Use Globus to transfer output forces.stat files back GLOBUS_ENDPOINT = "jln_laptop" -GLOBUS_DEST_DIR = ( - "/Users/jnavarro/Desktop/libensemble" - + "/libensemble/libensemble/tests/scaling_tests/forces/balsam_forces/ensemble" -) + +if not THIS_SCRIPT_ON_THETA: + GLOBUS_DEST_DIR_PREFIX = os.getcwd() + "/ensemble" +else: + GLOBUS_DEST_DIR_PREFIX = "/path/to/remote/ensemble/directory" # Parse number of workers, comms type, etc. from arguments nworkers, is_manager, libE_specs, _ = parse_args() @@ -32,7 +33,7 @@ "out": [("energy", float)], # Name, type of output from sim_f "user": { "globus_endpoint": GLOBUS_ENDPOINT, - "globus_dest_dir": GLOBUS_DEST_DIR, + "globus_dest_dir": GLOBUS_DEST_DIR_PREFIX, "this_script_on_theta": THIS_SCRIPT_ON_THETA, }, } diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/submit_libe_forces_remotely.py b/libensemble/tests/scaling_tests/forces/balsam_forces/submit_libe_forces_remotely.py index b236cf51a..3e4a52573 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/submit_libe_forces_remotely.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/submit_libe_forces_remotely.py @@ -26,30 +26,15 @@ LIBE_NODES = 1 LIBE_RANKS = 5 -# Parameter file for calling script. Must be transferred to Balsam site. 
-# globus_endpoint_key:/path/to/file -# globus_endpoint_key specified in BALSAM_SITE's settings.yml -TRANSFER_CONFIG_FILE = False -INPUT_FILE = ( - "jln_laptop:/Users/jnavarro/Desktop/libensemble" - + "/libensemble/libensemble/tests/scaling_tests/balsam_forces/balsam_forces.yaml" -) - -# Transfer forces.stat files back to the following local destination? -# If True, this script cancels remote allocation once SIM_MAX statfiles transferred -TRANSFER_STATFILES = False +# This script cancels remote allocation once SIM_MAX statfiles transferred TRANSFER_DESTINATION = "./ensemble" -SIM_MAX = 16 # must match balsam_forces.yaml +SIM_MAX = 16 # Retrieve the libEnsemble app from the Balsam service apps = ApplicationDefinition.load_by_site(BALSAM_SITE) RemoteLibensembleApp = apps["RemoteLibensembleApp"] RemoteLibensembleApp.resolve_site_id() -if TRANSFER_CONFIG_FILE: - transfers = {"input_file": INPUT_FILE} -else: - transfers = {} # Submit the libEnsemble app as a Job to the Balsam service. # It will wait for a compatible, running BatchJob session (remote allocation) @@ -57,7 +42,6 @@ workdir="libe_workflow", num_nodes=LIBE_NODES, ranks_per_node=LIBE_RANKS, - transfers=transfers, ) print("libEnsemble App retrieved and submitted as Job to Balsam service.") @@ -75,16 +59,15 @@ print("BatchJob session initialized. All Balsam apps will run in this BatchJob.") # Wait for all forces.stat files to be transferred back, then cancel the BatchJob -if TRANSFER_STATFILES: - os.makedirs(TRANSFER_DESTINATION, exist_ok=True) - print("Waiting for all returned forces.stat files...") +os.makedirs(TRANSFER_DESTINATION, exist_ok=True) +print("Waiting for all returned forces.stat files...") - while len(glob.glob(os.path.abspath(TRANSFER_DESTINATION) + "/*.stat")) != SIM_MAX: - time.sleep(3) +while len(glob.glob(os.path.abspath(TRANSFER_DESTINATION) + "/*.stat")) != SIM_MAX: + time.sleep(3) - print("All forces.stat files returned. 
Cancelling BatchJob session.") +print("All forces.stat files returned. Cancelling BatchJob session.") - batch.state = "pending_deletion" - batch.save() +batch.state = "pending_deletion" +batch.save() - print("BatchJob session cancelled. Success!") +print("BatchJob session cancelled. Success!") From ecfb76e927653768e283cb38b7a2d7d1eaad4f8c Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 20 Apr 2022 14:54:29 -0500 Subject: [PATCH 89/93] flake8 --- .../scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py index cc62d4008..b6f836b8b 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/run_libe_forces_balsam.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import os import socket import numpy as np From 6a6d6540e6235e2db9014e85c30d73e5a6c81ae1 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 21 Apr 2022 15:03:09 -0500 Subject: [PATCH 90/93] fix workdir concatenation, add additional catchable error in __init__.py, undo forces.c change adding nparticles to forces.stat filename --- libensemble/executors/__init__.py | 4 ++-- .../scaling_tests/forces/balsam_forces/forces_simf.py | 2 +- libensemble/tests/scaling_tests/forces/forces_app/forces.c | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/libensemble/executors/__init__.py b/libensemble/executors/__init__.py index 058145f37..85cdcf430 100644 --- a/libensemble/executors/__init__.py +++ b/libensemble/executors/__init__.py @@ -7,11 +7,11 @@ if pkg_resources.get_distribution("balsam"): # Balsam 0.7.0 onward (Balsam 2) from libensemble.executors.balsam_executor import BalsamExecutor -except (ModuleNotFoundError, pkg_resources.DistributionNotFound): +except (ModuleNotFoundError, ImportError, 
pkg_resources.DistributionNotFound): try: if pkg_resources.get_distribution("balsam-flow"): # Balsam up through 0.5.0 from libensemble.executors.legacy_balsam_executor import LegacyBalsamMPIExecutor - except (ModuleNotFoundError, pkg_resources.DistributionNotFound): + except (ModuleNotFoundError, ImportError, pkg_resources.DistributionNotFound): pass diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index 230fcd6f8..48c89f233 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -24,7 +24,7 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): "seed": particles, } - workdir = "sim" + str(libE_info["H_rows"]) + "_worker" + str(libE_info["workerID"]) + workdir = "sim" + str(libE_info["H_rows"][0]) + "_worker" + str(libE_info["workerID"]) statfile = "forces{}.stat".format(particles) diff --git a/libensemble/tests/scaling_tests/forces/forces_app/forces.c b/libensemble/tests/scaling_tests/forces/forces_app/forces.c index 6c4cb1caf..d0f2ce717 100755 --- a/libensemble/tests/scaling_tests/forces/forces_app/forces.c +++ b/libensemble/tests/scaling_tests/forces/forces_app/forces.c @@ -248,9 +248,8 @@ int print_step_summary(int step, double total_en, return 0; } -int open_stat_file(num_particles) { - char *statfile; - asprintf(&statfile, "forces%d.stat", num_particles); +int open_stat_file() { + char *statfile = "forces.stat"; stat_fp = fopen(statfile, "w"); if(stat_fp == NULL) { printf("Error opening statfile"); @@ -394,7 +393,7 @@ int main(int argc, char **argv) { fflush(stdout); if (rank == 0) { - open_stat_file(num_particles); + open_stat_file(); } gettimeofday(&tstart, NULL); From 799937779d450a80244ce0c5a35b5ad0969768cf Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 21 Apr 2022 15:05:59 -0500 Subject: [PATCH 91/93] small adjusts for undoing of nparticles in 
forces.stat filename --- docs/tutorials/executor_forces_tutorial.rst | 4 ++-- .../scaling_tests/forces/forces_simple/forces_simf_simple.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/executor_forces_tutorial.rst b/docs/tutorials/executor_forces_tutorial.rst index b742a98bd..6027e713f 100644 --- a/docs/tutorials/executor_forces_tutorial.rst +++ b/docs/tutorials/executor_forces_tutorial.rst @@ -241,7 +241,7 @@ and evaluated in a variety of helpful ways. For now, we're satisfied with waitin for the task to complete via ``task.wait()``. We can assume that afterward, any results are now available to parse. Our application -produces a ``forces[particles].stat`` file that contains either energy +produces a ``forces.stat`` file that contains either energy computations for every time-step or a "kill" message if particles were lost, which indicates a failed simulation. @@ -254,7 +254,7 @@ to ``WORKER_DONE``. Otherwise, send back ``NAN`` and a ``TASK_FAILED`` status: :linenos: # Stat file to check for bad runs - statfile = "forces{}.stat".format(particles) + statfile = "forces.stat" # Try loading final energy reading, set the sim's status try: diff --git a/libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py b/libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py index 960f779fa..a39666074 100644 --- a/libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py +++ b/libensemble/tests/scaling_tests/forces/forces_simple/forces_simf_simple.py @@ -26,7 +26,7 @@ def run_forces(H, persis_info, sim_specs, libE_info): task.wait(timeout=60) # Stat file to check for bad runs - statfile = "forces{}.stat".format(particles) + statfile = "forces.stat" # Try loading final energy reading, set the sim's status try: From b4fa0bcc36d073e674cf528b3d376f669ce84c68 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 21 Apr 2022 15:49:48 -0500 Subject: [PATCH 92/93] fix statfile transfer path --- 
.../tests/scaling_tests/forces/balsam_forces/forces_simf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index 48c89f233..4b64caf69 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -34,7 +34,7 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): "../" + workdir + "/" + transfer_statfile_path.split("/")[-1] ) else: - transfer_statfile_path = os.getcwd() + statfile + transfer_statfile_path = os.getcwd() + "/" + statfile local_statfile_path = transfer_statfile_path transfer = {"result": GLOBUS_ENDPOINT + ":" + transfer_statfile_path} From 4ccb911a3d8a2ace014ef1d73fbcffcdc326f86a Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 21 Apr 2022 17:10:05 -0500 Subject: [PATCH 93/93] try new loop that sleeps at least once, then checks if transferred statfile exists and isn't empty --- .../scaling_tests/forces/balsam_forces/forces_simf.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py index 4b64caf69..97da5e28d 100644 --- a/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py +++ b/libensemble/tests/scaling_tests/forces/balsam_forces/forces_simf.py @@ -24,7 +24,9 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): "seed": particles, } - workdir = "sim" + str(libE_info["H_rows"][0]) + "_worker" + str(libE_info["workerID"]) + workdir = ( + "sim" + str(libE_info["H_rows"][0]) + "_worker" + str(libE_info["workerID"]) + ) statfile = "forces{}.stat".format(particles) @@ -55,8 +57,13 @@ def run_forces_balsam(H, persis_info, sim_specs, libE_info): print("Task {} polled. 
state: {}.".format(task.name, task.state)) - while not os.path.lexists(local_statfile_path): + while True: time.sleep(1) + if ( + os.path.isfile(local_statfile_path) + and os.path.getsize(local_statfile_path) > 0 + ): + break try: data = np.loadtxt(local_statfile_path)