Skip to content

Commit

Permalink
Merge pull request #244 from Libensemble/debugging/balsam_no-file
Browse files Browse the repository at this point in the history
Debugging/balsam File Not Found Travis bug
  • Loading branch information
shuds13 committed Sep 12, 2019
2 parents 1dd35fb + 4cc5610 commit 1b7f754
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 26 deletions.
6 changes: 3 additions & 3 deletions conda/configure-balsam-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ export THIS_DIR=$PWD
export SCRIPT_BASENAME=script_test_balsam_hworld

# Set proper permissions, initialize Balsam DB, activate DB
export BALSAM_DB_PATH='~/test-balsam'
export BALSAM_DB_PATH=$HOME/test-balsam
sudo chown -R postgres:postgres /var/run/postgresql
sudo chmod a+w /var/run/postgresql
balsam init ~/test-balsam
sudo chmod -R 700 ~/test-balsam/balsamdb
balsam init $HOME/test-balsam
sudo chmod -R 700 $HOME/test-balsam/balsamdb
source balsamactivate test-balsam

# Refresh DB
Expand Down
49 changes: 33 additions & 16 deletions conda/test_balsam_hworld.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import subprocess
import os
import time
import sys
import libensemble
from libensemble.tests.regression_tests.common import modify_Balsam_worker
from libensemble.tests.regression_tests.common import modify_Balsam_worker, modify_Balsam_hostprint

# TESTSUITE_COMMS: local
# TESTSUITE_NPROCS: 3

# This test is NOT submitted as a job to Balsam. Instead, script_test_balsam_hworld.py
# This test executes that job through the 'runstr' line in run_Balsam_job()
# This test is NOT submitted as a job to Balsam. script_test_balsam_hworld.py is
# the executable submitted to Balsam as a job. This test executes that job
# through the 'runstr' line in run_Balsam_job()


def run_Balsam_job():
Expand All @@ -27,8 +27,7 @@ def wait_for_job_dir(basedb):

print('Waiting for Job Directory'.format(sleeptime))
while len(os.listdir(basedb)) == 0 and sleeptime < 15:
print('{}'.format(sleeptime), end=" ")
sys.stdout.flush()
print(sleeptime, end=" ", flush=True)
time.sleep(1)
sleeptime += 1

Expand All @@ -44,10 +43,9 @@ def wait_for_job_output(jobdir):
print('Checking for Balsam output file: {}'.format(output))

while not os.path.isfile(output) and sleeptime < 30:
print('{}'.format(sleeptime), end=" ")
sys.stdout.flush()
time.sleep(2)
sleeptime += 2
print(sleeptime, end=" ", flush=True)
time.sleep(1)
sleeptime += 1

return output

Expand All @@ -56,7 +54,9 @@ def print_job_output(outscript):
sleeptime = 0

print('Output file found. Waiting for complete Balsam Job Output.')
lastlines = ['Job 4 done on worker 1\n', 'Job 4 done on worker 2\n']
lastlines = ['Job 4 done on worker 1\n', 'Job 4 done on worker 2\n',
'Run completed.\n']

lastposition = 0

while sleeptime < 60:
Expand All @@ -66,10 +66,9 @@ def print_job_output(outscript):
lastposition = f.tell()

if len(new) > 0:
print(new)
print(new, flush=True)
else:
print('{}'.format(sleeptime), end=" ")
sys.stdout.flush()
print(sleeptime, end=" ", flush=True)

if any(new.endswith(line) for line in lastlines):
break
Expand All @@ -92,13 +91,14 @@ def move_job_coverage(jobdir):

if __name__ == '__main__':

# For Balsam-specific Coverage config file, to not evaluate Balsam data dir
# Used by the Balsam Coverage config file. Don't evaluate the Balsam data dir
libepath = os.path.dirname(libensemble.__file__)
os.environ['LIBE_PATH'] = libepath

basedb = os.path.expanduser('~/test-balsam/data/libe_test-balsam')
basedb = os.environ['HOME'] + '/test-balsam/data/libe_test-balsam'

modify_Balsam_worker()
modify_Balsam_hostprint()
run_Balsam_job()

jobdir = wait_for_job_dir(basedb)
Expand All @@ -107,3 +107,20 @@ def move_job_coverage(jobdir):
move_job_coverage(jobdir)

print('Test complete.')


# IN BALSAM LOG:

# 11-Sep-2019 14:36:27|7301| ERROR|balsam:47] Uncaught Exception <class 'ValueError'>: Cooley WorkerGroup needs workers_file to setup
# Traceback (most recent call last):
# File "/home/travis/build/Libensemble/balsam/balsam/launcher/launcher.py", line 443, in <module>
# main(args)
# File "/home/travis/build/Libensemble/balsam/balsam/launcher/launcher.py", line 422, in main
# launcher = Launcher(wf_filter, timelimit_min, gpus_per_node)
# File "/home/travis/build/Libensemble/balsam/balsam/launcher/launcher.py", line 104, in __init__
# self.worker_group = worker.WorkerGroup()
# File "/home/travis/build/Libensemble/balsam/balsam/launcher/worker.py", line 50, in __init__
# self.setup()
# File "/home/travis/build/Libensemble/balsam/balsam/launcher/worker.py", line 112, in setup_COOLEY
# raise ValueError("Cooley WorkerGroup needs workers_file to setup")
# ValueError: Cooley WorkerGroup needs workers_file to setup
40 changes: 40 additions & 0 deletions libensemble/tests/regression_tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,43 @@ def modify_Balsam_pyCoverage():
with open(balsam_commands_path, 'w') as f:
for line in lines:
f.write(line)


def modify_Balsam_hostprint():
    """Patch Balsam's launcher/worker.py to print host-type debug output.

    Also modifies Balsam's Worker & WorkerGroup to print the host type (for
    debugging purposes). The Balsam test bug may be caused by setup_COOLEY()
    being called instead of setup_DEFAULT() within Balsam's worker.py.

    The installed ``balsam/launcher/worker.py`` is rewritten in place:

    * after every ``self.host_type = ...`` assignment listed below, a
      ``print('HOST TYPE: ', self.host_type)`` is added at the same
      indentation as the assignment;
    * a ``print('IN setup_...')`` is added as the first body line of
      ``setup_COOLEY`` and ``setup_DEFAULT`` (header indent + 4 spaces).

    Lines are matched on their stripped text, so the patch does not depend
    on the exact leading whitespace used in worker.py. Prints added by an
    earlier call are dropped first, so calling this repeatedly does not
    stack duplicate prints.
    """
    import balsam

    # Statement inserted after each host_type assignment.
    host_print = "print('HOST TYPE: ', self.host_type)"
    host_assignments = ("self.host_type = JobEnv.host_type",
                        "self.host_type = host_type")

    # Method header -> statement inserted as the first line of its body.
    setup_prints = {"def setup_COOLEY(self):": "print('IN setup_COOLEY')",
                    "def setup_DEFAULT(self):": "print('IN setup_DEFAULT')"}

    # Everything this function may have inserted on a previous run.
    inserted = set(setup_prints.values()) | {host_print}

    balsam_worker_path = os.path.join(os.path.dirname(balsam.__file__),
                                      'launcher', 'worker.py')

    with open(balsam_worker_path, 'r') as f:
        lines = f.readlines()

    newlines = []
    for line in lines:
        code = line.strip()
        if code in inserted:
            continue  # Left over from an earlier run; re-added below if needed
        newlines.append(line)  # Line of code from prior
        indent = line[:len(line) - len(line.lstrip())]
        if code in host_assignments:
            newlines.append(indent + host_print + '\n')
        elif code in setup_prints:
            # One level deeper than the 'def' line, i.e. inside the method
            newlines.append(indent + '    ' + setup_prints[code] + '\n')

    with open(balsam_worker_path, 'w') as f:
        f.writelines(newlines)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import os
import numpy as np
import multiprocessing
import mpi4py
from mpi4py import MPI

Expand Down Expand Up @@ -31,12 +30,6 @@ def build_simfunc():
is_master = MPI.COMM_WORLD.Get_rank() == 0

cores_per_job = 1
logical_cores = multiprocessing.cpu_count()
cores_all_jobs = nworkers*cores_per_job

if is_master:
print('\nCores req: {} Cores avail: {}\n'.format(cores_all_jobs,
logical_cores))

sim_app = './my_simjob.x'
if not os.path.isfile(sim_app):
Expand Down

0 comments on commit 1b7f754

Please sign in to comment.