In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


# Put the notebook's parent directory first on the import path so that
# modules located one level up can be imported from the cells below.
sys.path.insert(0, os.path.abspath('..'))
# NOTE(review): presumably required so that `asyncio.run` can be called from
# cells while the notebook's own event loop is already running — confirm.
nest_asyncio.apply()

In [3]:
from asyncio import run, sleep

from asyncssh import connect
from decouple import config


# Remote login settings are looked up by name through decouple's `config`
# instead of being hard-coded in the notebook.
HPC_USER = config('HPC_USER')
HPC_HOST = config('HPC_HOST')

In [4]:
from contextlib import AsyncExitStack
from pathlib import Path
from typing import AsyncGenerator


class SCPClient:
    """Uploads a stream of byte chunks to a file on a remote host via SFTP."""

    def __init__(self, host: str, user: str):
        self.host = host
        self.user = user

    async def secure_copy(
        self,
        source_stream: AsyncGenerator[bytes, None],
        target_path: Path,
    ):
        """Write every chunk of ``source_stream`` to ``target_path`` remotely.

        Opens an SSH connection to ``self.host`` as ``self.user``, starts an
        SFTP client on it and opens the target in ``'wb'`` mode. The file,
        the SFTP client and the connection are released in that order when
        the copy finishes or an exception propagates.

        Args:
            source_stream: Asynchronous iterable yielding the bytes to write,
                in order.
            target_path: Destination path on the remote host.
        """
        remote_name = str(target_path)

        async with connect(self.host, username=self.user) as ssh:
            async with ssh.start_sftp_client() as sftp_client:
                async with sftp_client.open(remote_name, 'wb') as remote_file:
                    async for data in source_stream:
                        await remote_file.write(data)

In [5]:
import logging
import shlex

from asyncssh import ConnectionLost, Error, connect
from backoff import expo, on_exception


class PBSProClient:
    """Run PBS Pro command-line tools on a remote host over SSH.

    Every command opens its own SSH connection to ``host`` as ``user``.
    """

    def __init__(self, host: str, user: str):
        self.host = host
        self.user = user

    @on_exception(expo, (OSError, ConnectionLost), max_tries=4)
    async def _run(self, args: list) -> str:
        """Execute a command remotely and return its stripped standard output.

        The call is retried with ``expo`` backoff, for at most 4 tries, when
        it raises ``OSError`` or ``ConnectionLost``. Other exceptions,
        including the one raised for a non-zero exit status, propagate
        immediately.

        Args:
            args: The command name followed by its arguments,
                e.g. ``['qsub', 'job.pbs']``.

        Returns:
            The command's standard output without surrounding whitespace.

        Raises:
            ValueError: If ``args`` is empty.
            Exception: If the remote command exits with a non-zero status.
        """
        if not args:
            raise ValueError('args must contain at least the command name')

        # asyncssh expects the whole command as ONE string. Splatting the
        # list into `run()` passes the arguments as extra positionals, which
        # fails for anything longer than a single word. Quoting each item
        # keeps arguments with spaces or shell metacharacters intact.
        command = ' '.join(shlex.quote(str(arg)) for arg in args)

        async with await connect(self.host, username=self.user) as connection:
            # check=False: inspect the exit status ourselves so both output
            # streams can be logged before raising.
            result = await connection.run(command, check=False)

            if result.exit_status != 0:
                logging.error(
                    f'SSH process with command {result.command},\n'
                    f'completed with status {result.exit_status},\n'
                    f'STDOUT: {result.stdout},\n'
                    f'STDERR: {result.stderr}'
                )
                raise Exception(
                    f'SSH process completed with status {result.exit_status}'
                )

            return result.stdout.strip()

    async def queue_state(self):
        """Run ``qstat`` and return its raw, stripped output."""
        stdout = await self._run(['qstat'])
        # TODO parse

        return stdout

    async def queue_submit(self, pbs_filename: str) -> str:
        """Run ``qsub`` on ``pbs_filename`` and return its stripped output.

        NOTE(review): the output is presumably the identifier of the newly
        queued job — confirm against the PBS Pro version in use.
        """
        return await self._run(['qsub', pbs_filename])

    async def queue_delete(self, job_id: int):
        """Run ``qdel`` for ``job_id``; the command output is discarded."""
        await self._run(['qdel', str(job_id)])

    async def queue_hold(self):
        """Placeholder for ``qhold``; not implemented yet."""

    async def queue_release(self):
        """Placeholder for ``qrls``; not implemented yet."""

    async def trace_job(self):
        """Placeholder for ``tracejob``; not implemented yet."""

In [None]:
# The Error_Path line of the sample `qstat -f` output is exactly 79
# characters long and its value continues on the next line, so the output
# appears to be limited to 80 characters per line (79 + newline).
len('    Error_Path = x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_world')

79

In [13]:
# Sample output of `qstat -f 465317`; long values wrap onto indented
# continuation lines.
text = """
Job Id: 465317.x3000c0s25b0n0.hsn.hpc.srce.hr
    Job_Name = hello
    Job_Owner = lpanic@x3000c0s27b0n0.hsn.hpc.srce.hr
    resources_used.cpupercent = 0
    resources_used.cput = 00:00:00
    resources_used.mem = 0b
    resources_used.ncpus = 1
    resources_used.vmem = 0kb
    resources_used.walltime = 00:00:00
    job_state = R
    queue = cpu-single
    server = x3000c0s25b0n0.hsn.hpc.srce.hr
    Checkpoint = u
    ctime = Fri Mar 28 10:45:48 2025
    Error_Path = x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_world
	/hello.e465317
    exec_host = x8000c1s4b0n1/1
    exec_vnode = (x8000c1s4b0n1:mem=1843200kb:ncpus=1:ngpus=0)
    Hold_Types = n
    Join_Path = n
    Keep_Files = oed
    Mail_Points = a
    mtime = Fri Mar 28 10:45:53 2025
    Output_Path = x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_worl
	d/hello.o465317
    Priority = 0
    qtime = Fri Mar 28 10:45:48 2025
    Rerunable = True
    Resource_List.mem = 1800mb
    Resource_List.ncpus = 1
    Resource_List.ngpus = 0
    Resource_List.nodect = 1
    Resource_List.place = pack
    Resource_List.select = 1:mem=1800mb:ncpus=1:ngpus=0
    Resource_List.walltime = 48:00:00
    stime = Fri Mar 28 10:45:48 2025
    session_id = 3900810
    jobdir = /lustre/home/lpanic
    substate = 42
    Variable_List = PBS_O_HOME=/lustre/home/lpanic,PBS_O_LANG=en_US.UTF-8,
	PBS_O_LOGNAME=lpanic,
	PBS_O_PATH=/lustre/home/lpanic/.local/bin:/lustre/home/lpanic/bin:/opt
	/clmgr/sbin:/opt/clmgr/bin:/opt/sgi/sbin:/opt/sgi/bin:/usr/share/Module
	s/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/c3/bin:/op
	t/pbs/bin:/sbin:/bin,PBS_O_MAIL=/var/spool/mail/lpanic,
	PBS_O_SHELL=/bin/bash,PBS_O_HOST=x3000c0s27b0n0.hsn.hpc.srce.hr,
	PBS_O_WORKDIR=/lustre/home/lpanic/hello_world,PBS_O_SYSTEM=Linux,
	PBS_O_QUEUE=RouteQ
         = Job run at Fri Mar 28 at 10:45 on (x8000c1s4b0n1:mem=1843200kb:nc
	pus=1:ngpus=0)
    etime = Fri Mar 28 10:45:48 2025
    run_count = 1
    eligible_time = 00:00:05
    Submit_arguments = -koed hello.sh
    project = _pbs_project_default
    Submit_Host = x3000c0s27b0n0.hsn.hpc.srce.hr
"""

In [30]:
# Parse the sample `qstat -f` text and lower-case every field name in a
# single step, so the name is bound only once.
queue_status_full = {
    field.lower(): content
    for field, content in parse_queue_status_full(text).items()
}
queue_status_full

{'job_id': '465317.x3000c0s25b0n0.hsn.hpc.srce.hr',
 'job_name': 'hello',
 'job_owner': 'lpanic@x3000c0s27b0n0.hsn.hpc.srce.hr',
 'resources_used.cpupercent': '0',
 'resources_used.cput': '00:00:00',
 'resources_used.mem': '0b',
 'resources_used.ncpus': '1',
 'resources_used.vmem': '0kb',
 'resources_used.walltime': '00:00:00',
 'job_state': 'R',
 'queue': 'cpu-single',
 'server': 'x3000c0s25b0n0.hsn.hpc.srce.hr',
 'checkpoint': 'u',
 'ctime': 'Fri Mar 28 10:45:48 2025',
 'error_path': 'x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_world/hello.e465317',
 'exec_host': 'x8000c1s4b0n1/1',
 'exec_vnode': '(x8000c1s4b0n1:mem=1843200kb:ncpus=1:ngpus=0)',
 'hold_types': 'n',
 'join_path': 'n',
 'keep_files': 'oed',
 'mail_points': 'a',
 'mtime': 'Fri Mar 28 10:45:53 2025',
 'output_path': 'x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_world/hello.o465317',
 'priority': '0',
 'qtime': 'Fri Mar 28 10:45:48 2025',
 'rerunable': 'True',
 'resource_list.mem': '1800mb',
 'reso

In [54]:
# Build the job model from the lower-cased qstat fields.
# NOTE(review): PBSProJob is not defined in the visible cells — confirm it is
# imported or defined before this cell runs on a fresh kernel.
pbs_pro_job = PBSProJob(**queue_status_full)
pbs_pro_job

PBSProJob(id='465317.x3000c0s25b0n0.hsn.hpc.srce.hr', name='hello', owner='lpanic@x3000c0s27b0n0.hsn.hpc.srce.hr', state='R', queue='cpu-single', server='x3000c0s25b0n0.hsn.hpc.srce.hr', checkpoint='u', exec_host='x8000c1s4b0n1/1', exec_vnode='(x8000c1s4b0n1:mem=1843200kb:ncpus=1:ngpus=0)', error_path='x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_world/hello.e465317', output_path='x3000c0s27b0n0.hsn.hpc.srce.hr:/lustre/home/lpanic/hello_world/hello.o465317', dir=PosixPath('/lustre/home/lpanic'), hold_types='n', join_path='n', keep_files='oed', mail_points='a', substate=42, priority=0, session_id='3900810', rerunable=True, run_count=1, submit_arguments='-koed hello.sh', project='_pbs_project_default', submit_host='x3000c0s27b0n0.hsn.hpc.srce.hr')

In [52]:
# Build the timestamp model from the same lower-cased qstat fields.
# NOTE(review): PBSProTime is not defined in the visible cells — confirm it
# is available before this cell runs on a fresh kernel.
pbs_pro_time = PBSProTime(**queue_status_full)
pbs_pro_time

PBSProTime(created=datetime.datetime(2025, 3, 28, 10, 45, 48), queued=datetime.datetime(2025, 3, 28, 10, 45, 48), modified=datetime.datetime(2025, 3, 28, 10, 45, 53), started=datetime.datetime(2025, 3, 28, 10, 45, 48), eligible=datetime.datetime(2025, 3, 28, 10, 45, 48), eligible_delta=datetime.timedelta(seconds=5))

In [38]:
# Build the resources-used model from the same lower-cased qstat fields.
# NOTE(review): PBSProResourcesUsed is not defined in the visible cells —
# confirm it is available before this cell runs on a fresh kernel.
pbs_pro_resources_used = PBSProResourcesUsed(**queue_status_full)
pbs_pro_resources_used

PBSProResourcesUsed(cpu_percent=0, cpu_time=datetime.timedelta(0), num_cpus=1, memory=0, virtual_memory=0, walltime=datetime.timedelta(0))

In [39]:
# Build the requested-resources model from the same lower-cased qstat fields.
# NOTE(review): PBSProResourceList is not defined in the visible cells —
# confirm it is available before this cell runs on a fresh kernel.
pbs_pro_resource_list = PBSProResourceList(**queue_status_full)
pbs_pro_resource_list

PBSProResourceList(memory=1887436800, num_cpus=1, num_gpus=0, num_nodes=1, place='pack', select='1:mem=1800mb:ncpus=1:ngpus=0', walltime=datetime.timedelta(days=2))

In [None]:
# Split the `variable_list` field into individual variables and build the
# variable-list model from them.
# NOTE(review): in the output shown for this cell, `queue` comes out as
# 'RouteQ= Job run at ...': the keyless "= Job run at ..." line that follows
# Variable_List in the sample text was folded into the last variable. Looks
# like a fixture or parser problem — confirm and fix upstream.
# NOTE(review): parse_variable_list and PBSProVariableList are not defined in
# the visible cells — confirm they are available on a fresh kernel.
env_vars = parse_variable_list(queue_status_full['variable_list'])
pbs_pro_variable_list = PBSProVariableList(**env_vars)
pbs_pro_variable_list

PBSProVariableList(home=PosixPath('/lustre/home/lpanic'), path=PosixPath('/lustre/home/lpanic/.local/bin:/lustre/home/lpanic/bin:/opt/clmgr/sbin:/opt/clmgr/bin:/opt/sgi/sbin:/opt/sgi/bin:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/c3/bin:/opt/pbs/bin:/sbin:/bin'), mail=PosixPath('/var/spool/mail/lpanic'), shell=PosixPath('/bin/bash'), workdir=PosixPath('/lustre/home/lpanic/hello_world'), host='x3000c0s27b0n0.hsn.hpc.srce.hr', lang='en_US.UTF-8', logname='lpanic', system='Linux', queue='RouteQ= Job run at Fri Mar 28 at 10:45 on (x8000c1s4b0n1:mem=1843200kb:ncpus=1:ngpus=0)')