Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change worker_list file to node_list #455

Merged
merged 3 commits into from
May 15, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 8 additions & 10 deletions examples/libE_submission_scripts/bebop_submit_slurm_distrib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
# Workers are evenly spread over nodes and manager added to the first node.
# Requires even distribution - either multiple workers per node or nodes per worker
# Option for manager to have a dedicated node.
# Use of executor will ensure workers co-locate tasks with workers
# Use of MPI Executor will ensure tasks are co-located with the workers that launch them
# If node_list file is kept, this informs libe of resources. Else, libe auto-detects.

# User to edit these variables
export EXE=libE_calling_script.py
export NUM_WORKERS=4
export MANAGER_NODE=false # true = Manager has a dedicated node (assign one extra)
export USE_NODE_LIST=true # If false, allow libE to determine node_list from environment.

# As libE shares nodes with user applications allow fallback if contexts overrun.
unset I_MPI_FABRICS
Expand Down Expand Up @@ -50,25 +52,21 @@ echo -e "Directory is: $PWD"
# Generate a node list with 1 node per line:
srun hostname | sort -u > node_list

# Generate list of nodes for workers
if [[ $MANAGER_NODE = "true" ]]; then
tail -n +2 node_list > worker_list
else
cp node_list worker_list
fi

# Add manager node to machinefile
head -n 1 node_list > machinefile.$SLURM_JOBID

# Add worker nodes to machinefile
if [[ $SUB_NODE_WORKERS = "true" ]]; then
awk -v repeat=$WORKERS_PER_NODE '{for(i=0;i<repeat;i++)print}' worker_list \
awk -v repeat=$WORKERS_PER_NODE '{for(i=0;i<repeat;i++)print}' node_list \
>>machinefile.$SLURM_JOBID
else
awk -v patt="$NODES_PER_WORKER" 'NR % patt == 1' worker_list \
awk -v patt="$NODES_PER_WORKER" 'NR % patt == 1' node_list \
>> machinefile.$SLURM_JOBID
fi;

# Remove the generated node_list when the user opted out of it; with no
# node_list file present, libE determines the node list from the environment.
if [[ $USE_NODE_LIST = "false" ]]; then
    rm node_list
fi

# Put in a timestamp
echo Starting executation at: `date`

Expand Down
8 changes: 4 additions & 4 deletions libensemble/executors/mpi_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,24 @@ def __init__(self, auto_resources=True,
nodelist_env_slurm: String, optional
The environment variable giving a node list in Slurm format
(Default: Uses SLURM_NODELIST). Note: This is queried only if
a worker_list file is not provided and auto_resources=True.
a node_list file is not provided and auto_resources=True.

nodelist_env_cobalt: String, optional
The environment variable giving a node list in Cobalt format
(Default: Uses COBALT_PARTNAME) Note: This is queried only
if a worker_list file is not provided and
if a node_list file is not provided and
auto_resources=True.

nodelist_env_lsf: String, optional
The environment variable giving a node list in LSF format
(Default: Uses LSB_HOSTS) Note: This is queried only
if a worker_list file is not provided and
if a node_list file is not provided and
auto_resources=True.

nodelist_env_lsf_shortform: String, optional
The environment variable giving a node list in LSF short-form
format (Default: Uses LSB_MCPU_HOSTS) Note: This is queried only
if a worker_list file is not provided and auto_resources=True.
if a node_list file is not provided and auto_resources=True.

custom_info: dict, optional
Provide custom overrides to selected variables that are usually
Expand Down
8 changes: 4 additions & 4 deletions libensemble/resources/env_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,19 @@ def __init__(self,

nodelist_env_slurm: String, optional
The environment variable giving a node list in Slurm format (Default: uses SLURM_NODELIST).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.

nodelist_env_cobalt: String, optional
The environment variable giving a node list in Cobalt format (Default: uses COBALT_PARTNAME).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.

nodelist_env_lsf: String, optional
The environment variable giving a node list in LSF format (Default: uses LSB_HOSTS).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.

nodelist_env_lsf_shortform: String, optional
The environment variable giving a node list in LSF short-form format (Default: uses LSB_MCPU_HOSTS).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.
"""

self.schedular = None
Expand Down
14 changes: 7 additions & 7 deletions libensemble/resources/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class Resources:

These are set on initialization.

:ivar string top_level_dir: Directory where searches for worker_list file
:ivar string top_level_dir: Directory in which to search for the node_list file
:ivar boolean central_mode: If true, then running in central mode; otherwise distributed
:ivar EnvResources env_resources: An object storing environment variables used by resources
:ivar list global_nodelist: A list of all nodes available for running user applications
Expand All @@ -39,7 +39,7 @@ class Resources:
:ivar WorkerResources worker_resources: An object that can contain worker specific resources
"""

DEFAULT_NODEFILE = 'worker_list'
DEFAULT_NODEFILE = 'node_list'

def __init__(self, top_level_dir=None,
central_mode=False,
Expand Down Expand Up @@ -86,24 +86,24 @@ def __init__(self, top_level_dir=None,

node_file: String, optional
If supplied, give the name of a file in the run directory to use as a node-list
for use by libEnsemble. Defaults to a file named 'worker_list'. If the file does
for use by libEnsemble. Defaults to a file named 'node_list'. If the file does
not exist, then the node-list will be auto-detected.

nodelist_env_slurm: String, optional
The environment variable giving a node list in Slurm format (Default: uses SLURM_NODELIST).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.

nodelist_env_cobalt: String, optional
The environment variable giving a node list in Cobalt format (Default: uses COBALT_PARTNAME).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.

nodelist_env_lsf: String, optional
The environment variable giving a node list in LSF format (Default: uses LSB_HOSTS).
Note: This is queried only if a worker_list file is not provided and auto_resources=True.
Note: This is queried only if a node_list file is not provided and auto_resources=True.

nodelist_env_lsf_shortform: String, optional
The environment variable giving a node list in LSF short-form format (Default: uses LSB_MCPU_HOSTS)
Note: This is only queried if a worker_list file is not provided and auto_resources=True.
Note: This is only queried if a node_list file is not provided and auto_resources=True.

"""

Expand Down
36 changes: 18 additions & 18 deletions libensemble/tests/unit_tests/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@

def setup_standalone_run():
os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = ""
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


def teardown_standalone_run():
os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = ""
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


def setup_function(function):
Expand All @@ -23,8 +23,8 @@ def setup_function(function):
# del os.environ['LIBE_RESOURCES_TEST_NODE_LIST']
# if os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']:
# del os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


def teardown_function(function):
Expand All @@ -34,8 +34,8 @@ def teardown_function(function):
# del os.environ['LIBE_RESOURCES_TEST_NODE_LIST']
# if os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']:
# del os.environ['THIS_ENV_VARIABLE_IS_DEF_NOT_SET']
if os.path.isfile('worker_list'):
os.remove('worker_list')
if os.path.isfile('node_list'):
os.remove('node_list')


# Tests ========================================================================================
Expand Down Expand Up @@ -96,19 +96,19 @@ def test_get_global_nodelist_standalone():


def test_get_global_nodelist_frm_wrklst_file():
# worker_list file should override env variables
# node_list file should override env variables
os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "20-22,137-139,1234" # Should not be this
exp_out = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345'] # Should be this

open('worker_list', 'w').close()
open('node_list', 'w').close()
try:
_ = Resources.get_global_nodelist(rundir=os.getcwd())
except ResourcesException as e:
assert e.args[0] == 'Error. global_nodelist is empty'
else:
assert 0

with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for node in exp_out:
f.write(node + '\n')

Expand All @@ -123,7 +123,7 @@ def test_get_global_nodelist_frm_wrklst_file():
nodelist_env_lsf_shortform="THIS_ENV_VARIABLE_IS_DEF_NOT_SET")
global_nodelist2 = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
assert global_nodelist2 == exp_out, "global_nodelist returned does not match expected"
os.remove('worker_list')
os.remove('node_list')


def test_remove_libE_nodes():
Expand Down Expand Up @@ -189,7 +189,7 @@ def test_get_local_nodelist_central_mode():
def test_get_local_nodelist_central_mode_remove_libE_proc():
mynode = socket.gethostname()
nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']
with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for i, node in enumerate(nodelist_in):
f.write(node + '\n')
if i == 3:
Expand Down Expand Up @@ -233,7 +233,7 @@ def test_get_local_nodelist_central_mode_remove_libE_proc():
local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
assert local_nodelist == exp_out[wrk], "local_nodelist returned does not match expected"

os.remove('worker_list')
os.remove('node_list')


def test_get_local_nodelist_distrib_mode_host_not_in_list():
Expand Down Expand Up @@ -262,7 +262,7 @@ def test_get_local_nodelist_distrib_mode():
mynode = socket.gethostname()
# nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']
nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139']
with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for i, node in enumerate(nodelist_in):
f.write(node + '\n')
if i == 3:
Expand Down Expand Up @@ -312,13 +312,13 @@ def test_get_local_nodelist_distrib_mode():

local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
os.remove('worker_list')
os.remove('node_list')


def test_get_local_nodelist_distrib_mode_uneven_split():
mynode = socket.gethostname()
nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']
with open('worker_list', 'w') as f:
with open('node_list', 'w') as f:
for i, node in enumerate(nodelist_in):
f.write(node + '\n')
if i == 4:
Expand All @@ -332,7 +332,7 @@ def test_get_local_nodelist_distrib_mode_uneven_split():
exp_out = ['knl-0137', mynode, 'knl-0138', 'knl-0139']
local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
os.remove('worker_list')
os.remove('node_list')


class Fake_comm():
Expand Down