In [59]:
import os
import random
import shlex
import socket
import subprocess
import sys
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor

import ipywidgets as ipw
import zmq

In [60]:
def gen_random_hash():
    """Return a random 128-bit value as a 32-character lowercase hex string.

    The value is zero-padded on the left so the result always has the same
    length. It is drawn from the ``random`` module's global generator.
    """
    bits = random.getrandbits(128)
    return format(bits, '032x')

In [61]:
def job_running(job_id):
    """Report whether ``qstat`` still knows about a job.

    Args:
        job_id: Job identifier; it is converted with ``str()`` before being
            passed to ``qstat``.

    Returns:
        bool: True when ``qstat <job_id>`` exits successfully, False when it
        exits with a non-zero status.
    """
    # NOTE(review): a scheduler configured to keep finished jobs listed for a
    # while would make this return True after completion -- confirm.
    query = ['qstat', str(job_id)]
    try:
        subprocess.check_output(query)
    except subprocess.CalledProcessError:
        return False
    else:
        return True

In [62]:
def get_tempfile():
    """Create an empty temporary file and return its path.

    Uses ``tempfile.mkstemp`` rather than spawning the external ``mktemp``
    program, so no helper binary is needed on ``PATH``.

    Returns:
        str: Absolute path of the newly created, empty file. The caller is
        responsible for deleting it.
    """
    fd, path = tempfile.mkstemp()
    # mkstemp hands back an open descriptor; callers reopen the file by path,
    # so release it here instead of leaking it.
    os.close(fd)
    return path

In [63]:
def create_appended_text_file(path, addition):
    """Copy a text file to a new temporary file with extra text appended.

    The file at ``path`` is left untouched. The copy contains the original
    content, a newline, and then ``addition``.

    Args:
        path: Path of the text file to copy.
        addition: Text written after the original content.

    Returns:
        Path of the newly written file, as returned by ``get_tempfile()``.

    Raises:
        OSError: If ``path`` cannot be read or the new file cannot be written.
    """
    # Read the source first so an unreadable input does not leave an orphaned
    # temporary file behind.
    with open(path) as source:
        content = source.read()

    newpath = get_tempfile()
    with open(newpath, 'w') as target:
        target.write(content)
        # Separator so the addition starts on its own line even when the
        # original does not end with a newline.
        target.write('\n')
        target.write(addition)

    return newpath

In [64]:
def open_success_port():
    """Bind a new ZeroMQ PAIR socket to a randomly chosen TCP port.

    Returns:
        tuple: ``(socket, port)`` -- the bound socket and the port number
        returned by ``bind_to_random_port``.
    """
    zmq_context = zmq.Context()
    listener = zmq_context.socket(zmq.PAIR)
    chosen_port = listener.bind_to_random_port("tcp://*")
    return listener, chosen_port

In [65]:
def create_success_file(directory='.'):
    """Create an empty, uniquely named marker file for a job to write into.

    Args:
        directory: Directory in which to create the file. Defaults to the
            current working directory.
            NOTE(review): presumably this must be visible to the compute
            node running the job -- confirm.

    Returns:
        str: Absolute path of the new file, whose name starts with
        ``.watch_job_``.
    """
    fd, path = tempfile.mkstemp(prefix='.watch_job_', dir=directory)
    # Only the path is used from here on; close the descriptor so repeated
    # calls do not leak open file handles in the kernel process.
    os.close(fd)
    return path

In [66]:
def get_hostname():
    """Return the hostname of the machine running this code.

    Asks the standard library directly instead of spawning the external
    ``hostname`` program.

    Returns:
        str: The hostname with surrounding whitespace removed.
    """
    return socket.gethostname().strip()

In [67]:
def wrap_batch_script_zmq(batch_script, success_port, randhash):
    """Make a copy of a batch script that reports completion over ZeroMQ.

    The copy ends with a ``python -c`` command that connects a PAIR socket to
    ``tcp://<this host>:<success_port>`` and sends ``randhash`` as bytes.

    Args:
        batch_script: Path of the original batch script.
        success_port: Port number used in the connect URI.
        randhash: Token the appended command sends.

    Returns:
        Path of the wrapped copy, as returned by
        ``create_appended_text_file``.
    """
    endpoint = "tcp://{hostname}:{port}".format(
        hostname=get_hostname(), port=success_port
    )

    statements = [
        "import zmq",
        "context = zmq.Context()",
        "socket = context.socket(zmq.PAIR)",
        "socket.connect('{success_uri}')".format(success_uri=endpoint),
        "socket.send(b'{randhash}')".format(randhash=randhash),
    ]
    # Every line break becomes "; \" + newline: the semicolon separates the
    # Python statements and the trailing backslash continues the shell line.
    reporter = '\n'.join(statements).replace('\n', '; \\\n')

    success_command = 'python -c "{}"'.format(reporter)
    return create_appended_text_file(batch_script, success_command)

In [68]:
def wrap_batch_script(batch_script, success_file, randhash):
    """Make a copy of a batch script that writes a token to a marker file.

    The copy ends with an ``echo <randhash> > <success_file>`` command.

    Args:
        batch_script: Path of the original batch script.
        success_file: Path the appended command writes to.
        randhash: Token the appended command writes.

    Returns:
        Path of the wrapped copy, as returned by
        ``create_appended_text_file``.
    """
    # Quote both values for the shell: the marker path comes from the working
    # directory and may contain spaces or metacharacters, which would
    # otherwise split or corrupt the redirect target.
    success_command = "echo {} > {}".format(
        shlex.quote(str(randhash)), shlex.quote(str(success_file))
    )

    return create_appended_text_file(batch_script, success_command)

In [69]:
def submit_batch_script(script_path):
    """Print a batch script, submit it with ``qsub`` and return the job id.

    Args:
        script_path: Path of the batch script to submit.

    Returns:
        str: The output of ``qsub``, decoded and stripped of surrounding
        whitespace.

    Raises:
        subprocess.CalledProcessError: If ``qsub`` exits with a non-zero
            status.
    """
    with open(script_path) as script:
        script_text = script.read()
    # Echo what is about to be submitted so it appears in the cell output.
    print(script_text)

    raw_job_id = subprocess.check_output(['qsub', script_path])
    return raw_job_id.decode().strip()

In [70]:
def listen_for_success_zmq(socket, job_id, randhash, delay=1):
    """Wait for a job to leave the queue, then check its success message.

    Polls ``job_running(job_id)`` every ``delay`` seconds. Once it returns
    False, one non-blocking receive is attempted on ``socket``.

    Args:
        socket: Socket on which the job is expected to have sent its token.
        job_id: Identifier of the submitted job.
        randhash: Token that signals success.
        delay: Seconds to sleep between queue checks.

    Raises:
        SystemExit: With status 1 when no message is waiting or when the
            received message differs from ``randhash``.
    """
    while True:
        if not job_running(job_id):
            break
        time.sleep(delay)

    # The job has left the batch queue; a success message, if any, should
    # already be waiting on the socket.
    try:
        received = socket.recv(zmq.NOBLOCK).decode().strip()
    except zmq.Again:
        # Nothing was sent, which is treated as a failed job.
        print("Job failed.")
        sys.exit(1)

    if received == randhash:
        print("Job success.")
        return

    print("Wrong hash!")
    print("Wanted '{}'".format(randhash))
    print("Received '{}'".format(received))
    sys.exit(1)

In [91]:
def poll_success_file(filepath, job_id, randhash, delay=1):
    """Wait for a job to leave the queue, then check its marker file.

    Polls ``job_running(job_id)`` every ``delay`` seconds. Once it returns
    False, the marker file is read and compared with ``randhash``. The
    marker file is deleted before returning or raising.

    Args:
        filepath: Path of the marker file the job writes its token to.
        job_id: Identifier of the submitted job.
        randhash: Token that signals success.
        delay: Seconds to sleep between queue checks.

    Raises:
        SystemExit: With status 1 when the marker file is empty, missing, or
            holds a different token.
    """
    try:
        while job_running(job_id):
            time.sleep(delay)

        # Job is no longer in the batch queue
        try:
            with open(filepath) as fh:
                message = fh.read().strip()
        except FileNotFoundError:
            # The marker file is gone, so the outcome cannot be determined.
            print("Unexpected error.")
            sys.exit(1)

        if message == randhash:
            print("Job success.")
        elif message == '':
            # Nothing was written to the marker file: reported as a failure.
            print("Job failed.")
            sys.exit(1)
        else:
            print("Wrong hash!")
            print("Wanted '{}'".format(randhash))
            print("Received '{}'".format(message))
            sys.exit(1)
    finally:
        # Always delete the marker file, but tolerate it already being gone;
        # otherwise the cleanup error would replace the SystemExit above.
        try:
            os.remove(filepath)
        except FileNotFoundError:
            pass

In [92]:
def run_batch_job_zmq(batch_script):
    """Submit a batch script and wait for its success message over ZeroMQ.

    Opens a listening socket, submits a wrapped copy of ``batch_script``
    that sends a random token back to that socket, and blocks until the job
    has left the queue.

    Args:
        batch_script: Path of the batch script to run.

    Raises:
        SystemExit: Propagated from ``listen_for_success_zmq`` when the job
            does not report the expected token.
    """
    socket, success_port = open_success_port()
    try:
        randhash = gen_random_hash()

        # Use the ZeroMQ wrapper here: the file-based one would treat the
        # port number as a file path.
        new_batch_script = wrap_batch_script_zmq(
            batch_script,
            success_port,
            randhash
        )

        job_id = submit_batch_script(new_batch_script)

        listen_for_success_zmq(socket, job_id, randhash)
    finally:
        # Release the bound port whether the job succeeded, failed, or the
        # submission itself raised.
        socket.close()

In [93]:
def run_batch_job(batch_script):
    """Submit a batch script and wait for it to write its success marker.

    Creates a marker file, submits a wrapped copy of ``batch_script`` that
    writes a random token into it, and blocks until the job has left the
    queue.

    Args:
        batch_script: Path of the batch script to run.

    Raises:
        SystemExit: Propagated from ``poll_success_file`` when the marker
            does not hold the expected token.
    """
    success_file = create_success_file()
    try:
        randhash = gen_random_hash()

        new_batch_script = wrap_batch_script(
            batch_script,
            success_file,
            randhash
        )

        job_id = submit_batch_script(new_batch_script)
    except BaseException:
        # Polling never started, so nothing else will delete the marker
        # file; remove it here before re-raising the original error.
        try:
            os.remove(success_file)
        except OSError:
            pass
        raise

    # From here on poll_success_file owns the marker file and deletes it.
    poll_success_file(success_file, job_id, randhash)

In [97]:
# Submit the sample batch script and block until it has left the queue;
# the wrapped script is printed and the outcome is reported below.
run_batch_job('batch_scripts/test.batch')

#!/bin/bash
#PBS -l nodes=n015:ppn=1
#PBS -l nice=10
#PBS -j oe
#PBS -q default
#PBS -N KaleTest
#PBS -r n
cd $PBS_O_WORKDIR
echo "I'm feeling sleepy."
sleep 10
echo "That was a nice nap."

echo '4401f2d58776b29438776e87cb05f6de' > /home/oge1/kale/.watch_job__wu1525u
Job success.
