Skip to content

Commit

Permalink
Merge 898b046 into b35ad94
Browse files Browse the repository at this point in the history
  • Loading branch information
shuds13 committed May 15, 2020
2 parents b35ad94 + 898b046 commit 8d175fe
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 43 deletions.
18 changes: 8 additions & 10 deletions examples/libE_submission_scripts/bebop_submit_slurm_distrib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
# Workers are evenly spread over nodes and manager added to the first node.
# Requires even distribution - either multiple workers per node or nodes per worker
# Option for manager to have a dedicated node.
# Use of executor will ensure workers co-locate tasks with workers
# Use of the MPI Executor ensures that tasks are co-located with their workers
# If node_list file is kept, this informs libe of resources. Else, libe auto-detects.

# User to edit these variables
export EXE=libE_calling_script.py
export NUM_WORKERS=4
export MANAGER_NODE=false # true = Manager has a dedicated node (assign one extra)
export USE_NODE_LIST=true # If false, allow libE to determine node_list from environment.

# As libE shares nodes with user applications allow fallback if contexts overrun.
unset I_MPI_FABRICS
Expand Down Expand Up @@ -50,25 +52,21 @@ echo -e "Directory is: $PWD"
# Generate a node list with 1 node per line:
srun hostname | sort -u > node_list

# Generate list of nodes for workers
if [[ $MANAGER_NODE = "true" ]]; then
tail -n +2 node_list > worker_list
else
cp node_list worker_list
fi

# Add manager node to machinefile
head -n 1 node_list > machinefile.$SLURM_JOBID

# Add worker nodes to machinefile
if [[ $SUB_NODE_WORKERS = "true" ]]; then
awk -v repeat=$WORKERS_PER_NODE '{for(i=0;i<repeat;i++)print}' worker_list \
awk -v repeat=$WORKERS_PER_NODE '{for(i=0;i<repeat;i++)print}' node_list \
>>machinefile.$SLURM_JOBID
else
awk -v patt="$NODES_PER_WORKER" 'NR % patt == 1' worker_list \
awk -v patt="$NODES_PER_WORKER" 'NR % patt == 1' node_list \
>> machinefile.$SLURM_JOBID
fi;

if [[ $USE_NODE_LIST = "false" ]]; then
rm node_list

# Put in a timestamp
echo Starting executation at: `date`

Expand Down
8 changes: 4 additions & 4 deletions libensemble/executors/mpi_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,24 @@ def __init__(self, auto_resources=True,
nodelist_env_slurm: String, optional
The environment variable giving a node list in Slurm format
(Default: Uses SLURM_NODELIST). Note: This is queried only if
a worker_list file is not provided and auto_resources=True.
a node_list file is not provided and auto_resources=True.
nodelist_env_cobalt: String, optional
The environment variable giving a node list in Cobalt format
(Default: Uses COBALT_PARTNAME) Note: This is queried only
if a worker_list file is not provided and
if a node_list file is not provided and
auto_resources=True.
nodelist_env_lsf: String, optional
The environment variable giving a node list in LSF format
(Default: Uses LSB_HOSTS) Note: This is queried only
if a worker_list file is not provided and
if a node_list file is not provided and
auto_resources=True.
nodelist_env_lsf_shortform: String, optional
The environment variable giving a node list in LSF short-form
format (Default: Uses LSB_MCPU_HOSTS) Note: This is queried only
if a worker_list file is not provided and auto_resources=True.
if a node_list file is not provided and auto_resources=True.
custom_info: dict, optional
Provide custom overrides to selected variables that are usually
Expand Down
8 changes: 4 additions & 4 deletions libensemble/resources/env_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,19 @@ def __init__(self,
nodelist_env_slurm: String, optional
The environment variable giving a node list in Slurm format (Default: uses SLURM_NODELIST).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
nodelist_env_cobalt: String, optional
The environment variable giving a node list in Cobalt format (Default: uses COBALT_PARTNAME).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
nodelist_env_lsf: String, optional
The environment variable giving a node list in LSF format (Default: uses LSB_HOSTS).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
nodelist_env_lsf_shortform: String, optional
The environment variable giving a node list in LSF short-form format (Default: uses LSB_MCPU_HOSTS).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
"""

self.schedular = None
Expand Down
14 changes: 7 additions & 7 deletions libensemble/resources/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class Resources:
These are set on initialization.
:ivar string top_level_dir: Directory where searches for worker_list file
:ivar string top_level_dir: Directory in which to search for the node_list file
:ivar boolean central_mode: If true, then running in central mode; otherwise distributed
:ivar EnvResources env_resources: An object storing environment variables used by resources
:ivar list global_nodelist: A list of all nodes available for running user applications
Expand All @@ -39,7 +39,7 @@ class Resources:
:ivar WorkerResources worker_resources: An object that can contain worker specific resources
"""

DEFAULT_NODEFILE = 'worker_list'
DEFAULT_NODEFILE = 'node_list'

def __init__(self, top_level_dir=None,
central_mode=False,
Expand Down Expand Up @@ -86,24 +86,24 @@ def __init__(self, top_level_dir=None,
node_file: String, optional
If supplied, give the name of a file in the run directory to use as a node-list
for use by libEnsemble. Defaults to a file named 'worker_list'. If the file does
for use by libEnsemble. Defaults to a file named 'node_list'. If the file does
not exist, then the node-list will be auto-detected.
nodelist_env_slurm: String, optional
The environment variable giving a node list in Slurm format (Default: uses SLURM_NODELIST).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
nodelist_env_cobalt: String, optional
The environment variable giving a node list in Cobalt format (Default: uses COBALT_PARTNAME).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
nodelist_env_lsf: String, optional
The environment variable giving a node list in LSF format (Default: uses LSB_HOSTS).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
nodelist_env_lsf_shortform: String, optional
The environment variable giving a node list in LSF short-form format (Default: uses LSB_MCPU_HOSTS).
Note: This is only queried if a worker_list file is not provided and auto_resources=True.
Note: This is only queried if a node_list file is not provided and auto_resources=True.
"""

Expand Down
36 changes: 18 additions & 18 deletions libensemble/tests/unit_tests/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@

def setup_standalone_run():
os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = ""
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


def teardown_standalone_run():
os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = ""
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


def setup_function(function):
Expand All @@ -23,8 +23,8 @@ def setup_function(function):
# del os.environ['LIBE_RESOURCES_TEST_NODE_LIST']
# if os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']:
# del os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


def teardown_function(function):
Expand All @@ -34,8 +34,8 @@ def teardown_function(function):
# del os.environ['LIBE_RESOURCES_TEST_NODE_LIST']
# if os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']:
# del os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


# Tests ========================================================================================
Expand Down Expand Up @@ -96,19 +96,19 @@ def test_get_global_nodelist_standalone():


def test_get_global_nodelist_frm_wrklst_file():
# worker_list file should override env variables
# node_list file should override env variables
os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "20-22,137-139,1234" # Should not be this
exp_out = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345'] # Should be this

open('worker_list', 'w').close()
open('node_list', 'w').close()
try:
_ = Resources.get_global_nodelist(rundir=os.getcwd())
except ResourcesException as e:
assert e.args[0] == 'Error. global_nodelist is empty'
else:
assert 0

with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for node in exp_out:
f.write(node + '\n')

Expand All @@ -123,7 +123,7 @@ def test_get_global_nodelist_frm_wrklst_file():
nodelist_env_lsf_shortform="THIS_ENV_VARIABLE_IS_DEF_NOT_SET")
global_nodelist2 = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
assert global_nodelist2 == exp_out, "global_nodelist returned does not match expected"
os.remove('worker_list')
os.remove('node_list')


def test_remove_libE_nodes():
Expand Down Expand Up @@ -189,7 +189,7 @@ def test_get_local_nodelist_central_mode():
def test_get_local_nodelist_central_mode_remove_libE_proc():
mynode = socket.gethostname()
nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']
with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for i, node in enumerate(nodelist_in):
f.write(node + '\n')
if i == 3:
Expand Down Expand Up @@ -233,7 +233,7 @@ def test_get_local_nodelist_central_mode_remove_libE_proc():
local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
assert local_nodelist == exp_out[wrk], "local_nodelist returned does not match expected"

os.remove('worker_list')
os.remove('node_list')


def test_get_local_nodelist_distrib_mode_host_not_in_list():
Expand Down Expand Up @@ -262,7 +262,7 @@ def test_get_local_nodelist_distrib_mode():
mynode = socket.gethostname()
# nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']
nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139']
with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for i, node in enumerate(nodelist_in):
f.write(node + '\n')
if i == 3:
Expand Down Expand Up @@ -312,13 +312,13 @@ def test_get_local_nodelist_distrib_mode():

local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
os.remove('worker_list')
os.remove('node_list')


def test_get_local_nodelist_distrib_mode_uneven_split():
mynode = socket.gethostname()
nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']
with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for i, node in enumerate(nodelist_in):
f.write(node + '\n')
if i == 4:
Expand All @@ -332,7 +332,7 @@ def test_get_local_nodelist_distrib_mode_uneven_split():
exp_out = ['knl-0137', mynode, 'knl-0138', 'knl-0139']
local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
os.remove('worker_list')
os.remove('node_list')


class Fake_comm():
Expand Down

0 comments on commit 8d175fe

Please sign in to comment.