Skip to content

Commit

Permalink
Use allocation queue name for naming allocations
Browse files Browse the repository at this point in the history
  • Loading branch information
Kobzol committed May 22, 2024
1 parent 4960e29 commit a36fcc2
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 11 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

* Enable passing of empty `stdout`/`stderr` to Python function tasks in the Python
API (https://github.com/It4innovations/hyperqueue/issues/691).
* `hq alloc add --name <name>` will now correctly use the passed `<name>` to name allocations submitted to Slurm/PBS.

# v0.18.0

Expand Down
8 changes: 8 additions & 0 deletions crates/hyperqueue/src/server/autoalloc/queue/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ pub fn create_allocation_dir(
Ok(dir)
}

/// Builds the name used for an external allocation.
///
/// The result has the form `<prefix>-<allocation_id>`. The prefix is the
/// provided `name` when it is `Some`; otherwise it falls back to
/// `hq-<queue_id>`.
pub fn format_allocation_name(name: Option<String>, queue_id: u32, allocation_id: u64) -> String {
    let prefix = match name {
        Some(custom) => custom,
        // No explicit name was given: derive the prefix from the queue ID.
        None => format!("hq-{queue_id}"),
    };
    format!("{prefix}-{allocation_id}")
}

/// Submits a script into PBS/Slurm and creates debug information in the given allocation `directory`.
pub async fn submit_script<F>(
script: String,
Expand Down
6 changes: 3 additions & 3 deletions crates/hyperqueue/src/server/autoalloc/queue/pbs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ use crate::common::manager::info::ManagerType;
use crate::common::manager::pbs::{format_pbs_duration, parse_pbs_datetime};
use crate::common::utils::time::local_to_system_time;
use crate::server::autoalloc::queue::common::{
build_worker_args, check_command_output, create_allocation_dir, create_command, submit_script,
wrap_worker_cmd, ExternalHandler,
build_worker_args, check_command_output, create_allocation_dir, create_command,
format_allocation_name, submit_script, wrap_worker_cmd, ExternalHandler,
};
use crate::server::autoalloc::queue::{
AllocationExternalStatus, AllocationStatusMap, AllocationSubmissionResult, QueueHandler,
Expand Down Expand Up @@ -64,7 +64,7 @@ impl QueueHandler for PbsHandler {
let script = build_pbs_submit_script(
worker_count,
timelimit,
&format!("hq-alloc-{queue_id}"),
&format_allocation_name(name, queue_id, allocation_num),
&directory.join("stdout").display().to_string(),
&directory.join("stderr").display().to_string(),
&queue_info.additional_args.join(" "),
Expand Down
6 changes: 3 additions & 3 deletions crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ use crate::common::manager::slurm::{
};
use crate::common::utils::time::local_to_system_time;
use crate::server::autoalloc::queue::common::{
build_worker_args, create_allocation_dir, create_command, submit_script, wrap_worker_cmd,
ExternalHandler,
build_worker_args, create_allocation_dir, create_command, format_allocation_name,
submit_script, wrap_worker_cmd, ExternalHandler,
};
use crate::server::autoalloc::queue::{
common, AllocationExternalStatus, AllocationStatusMap, AllocationSubmissionResult,
Expand Down Expand Up @@ -67,7 +67,7 @@ impl QueueHandler for SlurmHandler {
let script = build_slurm_submit_script(
worker_count,
timelimit,
&format!("hq-alloc-{queue_id}"),
&format_allocation_name(name, queue_id, allocation_num),
&working_dir.join("stdout").display().to_string(),
&working_dir.join("stderr").display().to_string(),
&queue_info.additional_args.join(" "),
Expand Down
31 changes: 27 additions & 4 deletions tests/autoalloc/test_autoalloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def test_pbs_queue_qsub_args(hq_env: HqEnv):
pbs_args = extract_script_args(data, "#PBS")
assert pbs_args == [
"-l select=1",
"-N hq-alloc-1",
"-N hq-1-1",
f"-o {join(dirname(qsub_script_path), 'stdout')}",
f"-e {join(dirname(qsub_script_path), 'stderr')}",
"-l walltime=00:03:00",
Expand All @@ -246,10 +246,10 @@ def test_slurm_queue_sbatch_args(hq_env: HqEnv):
sbatch_script_path = queue.get()
with open(sbatch_script_path) as f:
data = f.read()
pbs_args = extract_script_args(data, "#SBATCH")
assert pbs_args == [
slurm_args = extract_script_args(data, "#SBATCH")
assert slurm_args == [
"--nodes=1",
"--job-name=hq-alloc-1",
"--job-name=hq-1-1",
f"--output={join(dirname(sbatch_script_path), 'stdout')}",
f"--error={join(dirname(sbatch_script_path), 'stderr')}",
"--time=00:03:00",
Expand Down Expand Up @@ -908,6 +908,29 @@ def test_external_slurm_submit_multiple_workers(cluster_hq_env: HqEnv, slurm_cre
wait_for_job_state(cluster_hq_env, 1, "FINISHED")


def test_slurm_allocation_name(hq_env: HqEnv):
    """
    Check that a Slurm allocation queue created with an explicit name uses
    that name, suffixed with an increasing counter, as the Slurm job name
    of the submitted allocations.
    """
    queue = ManagerQueue()
    handler = ExtractSubmitScriptPath(queue, SlurmManager())

    def check_name(path: str, name: str):
        # Read the submit script at `path` and compare the value of its
        # `--job-name` directive with the expected `name`.
        prefix = "--job-name="
        with open(path) as f:
            data = f.read()
        slurm_args = extract_script_args(data, "#SBATCH")
        for arg in slurm_args:
            # Match only at the start of the argument, so that the slice below
            # always strips exactly the option prefix and nothing else.
            if arg.startswith(prefix):
                assert arg[len(prefix) :] == name
                return
        raise Exception(f"Slurm name {name} not found in {path}")

    with MockJobManager(hq_env, adapt_slurm(handler)):
        hq_env.start_server()
        prepare_tasks(hq_env)

        # A backlog of 2 yields two submitted allocations, whose script paths
        # are read from the queue one by one.
        add_queue(hq_env, manager="slurm", name="foo", backlog=2)
        check_name(queue.get(), "foo-1")
        check_name(queue.get(), "foo-2")


def wait_for_alloc(hq_env: HqEnv, state: str, allocation_id: str, timeout=DEFAULT_TIMEOUT):
"""
Wait until an allocation has the given `state`.
Expand Down
2 changes: 1 addition & 1 deletion tests/autoalloc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def extract_script_commands(script: str) -> str:
def add_queue(
hq_env: HqEnv,
manager: ManagerType,
name: Optional[str] = "foo",
name: Optional[str] = None,
backlog=1,
workers_per_alloc=1,
additional_worker_args: List[str] = None,
Expand Down

0 comments on commit a36fcc2

Please sign in to comment.