Skip to content

Commit

Permalink
Merge pull request #16 from Kortemme-Lab/external_metrics
Browse files Browse the repository at this point in the history
 Refactor how jobs are run, and add external metrics.
  • Loading branch information
kalekundert committed Mar 16, 2018
2 parents 0ca2667 + c22caf0 commit b8b6a8d
Show file tree
Hide file tree
Showing 7 changed files with 253 additions and 158 deletions.
121 changes: 97 additions & 24 deletions pull_into_place/big_jobs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python2

import sys, os, re, json, time, subprocess
import sys, os, re, json, subprocess, gzip
from klab.process import tee
from . import pipeline

def submit(script, workspace, **params):
Expand All @@ -22,10 +23,9 @@ def submit(script, workspace, **params):

if test_run:
nstruct = 50
max_runtime = '2:00:00'

if nstruct is None:
raise TypeError("sumbit() requires the keyword argument 'nstruct' for production runs.")
raise TypeError("submit() requires the keyword argument 'nstruct' for production runs.")

# Submit the job and put it immediately into the hold state.

Expand All @@ -50,7 +50,7 @@ def submit(script, workspace, **params):

job_id = status_match.group(1)

with open(workspace.job_params_path(job_id), 'w') as file:
with open(workspace.job_info_path(job_id), 'w') as file:
json.dump(params, file)

# Release the hold on the job.
Expand All @@ -61,20 +61,105 @@ def submit(script, workspace, **params):

def initiate():
    """
    Return some relevant information about the currently running job.

    Returns a (workspace, job_info) tuple.  The workspace is found from the
    directory given on the command line, and job_info is the parameter dict
    that submit() serialized to JSON when the job was queued, augmented with
    this task's ids.

    Note: the interleaved leftovers from the previous revision (job_id /
    task_id locals and the read_params() call, a function that no longer
    exists) were removed; they were dead code that shadowed the new logic.
    """
    print_debug_header()

    workspace = pipeline.workspace_from_dir(sys.argv[1])
    workspace.cd_to_root()

    # SGE communicates the job and task ids via environment variables.
    # Task ids are 1-based, so convert to a 0-based index.
    job_info = read_job_info(workspace.job_info_path(os.environ['JOB_ID']))
    job_info['job_id'] = int(os.environ['JOB_ID'])
    job_info['task_id'] = int(os.environ['SGE_TASK_ID']) - 1

    return workspace, job_info

def debrief():
    """
    Report the amount of memory used by this job, among other things.
    """
    # qstat accepts "<job>.<task>" to report on one task of an array job.
    task = '{0}.{1}'.format(os.environ['JOB_ID'], os.environ['SGE_TASK_ID'])
    run_command(['/usr/local/sge/bin/linux-x64/qstat', '-j', task])

def run_rosetta(workspace, job_info,
        use_resfile=False, use_restraints=False, use_fragments=False):
    """
    Run a RosettaScripts simulation for the current array task, then run any
    external metric scripts on the resulting model.

    Arguments:
        workspace: the pipeline workspace; supplies all the input/output
            paths and Rosetta settings used below.
        job_info: the parameter dict returned by initiate(); used to locate
            this task's input structure and output names, and to decide
            whether this is a test run.
        use_resfile: if true, pass the workspace resfile to the packer.
        use_restraints: if true, pass the restraint file on the command line.
        use_fragments: if true, add the workspace's fragment flags.
    """

    # Test runs ask the XML protocol to take shortcuts ('fast=yes') and use
    # a smaller vall database.
    test_run = job_info.get('test_run', False)
    rosetta_cmd = [
            workspace.rosetta_scripts_path,
            '-database', workspace.rosetta_database_path,
            '-in:file:s', workspace.input_path(job_info),
            '-in:file:native', workspace.input_path(job_info),
            '-out:prefix', workspace.output_prefix(job_info),
            '-out:suffix', workspace.output_suffix(job_info),
            '-out:no_nstruct_label',
            '-out:overwrite',
            '-out:pdb_gz',
            '-out:mute', 'protocols.loops.loops_main',
            '-parser:protocol', workspace.protocol_path,
            # Variables substituted into the XML protocol (%%var%% syntax).
            '-parser:script_vars',
                'wts_file=' + workspace.scorefxn_path,
                'cst_file=' + workspace.restraints_path,
                'loop_file=' + workspace.loops_path,
                'loop_start=' + str(workspace.loop_boundaries[0]),
                'loop_end=' + str(workspace.loop_boundaries[1]),
                'outputs_folder=' + workspace.seqprof_dir,
                'design_number=' + workspace.output_basename(job_info),
                'vall_path=' + workspace.rosetta_vall_path(test_run),
                'fragment_weights=' + workspace.fragment_weights_path,
                'fast=' + ('yes' if test_run else 'no'),
    ]
    if use_resfile: rosetta_cmd += [
            '-packing:resfile', workspace.resfile_path,
    ]
    if use_restraints: rosetta_cmd += [
            '-constraints:cst_fa_file', workspace.restraints_path,
    ]
    if use_fragments: rosetta_cmd += \
            workspace.fragments_flags(workspace.input_path(job_info))

    # The flags file comes last so it can be overridden by the flags above.
    # NOTE(review): presumably '@' and the path are joined by rosetta's
    # option parser — confirm the two-element form is intended.
    rosetta_cmd += [
            '@', workspace.flags_path,
    ]

    run_command(rosetta_cmd)
    run_external_metrics(workspace, job_info)

def run_external_metrics(workspace, job_info):
pdb_path = workspace.output_path(job_info)

for metric in workspace.metric_scripts:
command = metric, pdb_path

print "Working directory:", os.getcwd()
print "Command:", ' '.join(command)
sys.stdout.flush()

stdout, stderr = tee([metric, pdb_path])
file = gzip.open(pdb_path, 'a')

for line in stdout.strip().split('\n'):
if line.strip():
file.write('EXTRA_METRIC {0}\n'.format(line))

file.close()

def run_command(command):
print "Working directory:", os.getcwd()
print "Command:", ' '.join(command)
sys.stdout.flush()

process = subprocess.Popen(command)

print "Process ID:", process.pid
print
sys.stdout.flush()

return workspace, job_id, task_id, job_params
process.wait()

def read_job_info(json_path):
    """
    Return the job parameters parsed from the given JSON file.

    The file is written by submit() when the job is queued and read back by
    initiate() once the job starts running on the cluster.  (The stale,
    truncated 'read_params' header lines that had been fused onto this
    definition were removed; they made the span syntactically invalid.)
    """
    with open(json_path) as file:
        return json.load(file)

def print_debug_info():
def print_debug_header():
from datetime import datetime
from socket import gethostname

Expand All @@ -86,17 +171,5 @@ def print_debug_info():
print
sys.stdout.flush()

def run_command(command):
print "Working directory:", os.getcwd()
print "Command:", ' '.join(command)
sys.stdout.flush()

process = subprocess.Popen(command)

print "Process ID:", process.pid
print
sys.stdout.flush()
process.wait()
jobnumber = os.environ['JOB_ID'] + '.' + os.environ['SGE_TASK_ID']
print 'Job Number:', jobnumber
subprocess.call(['/usr/local/sge/bin/linux-x64/qstat','-j',jobnumber])

41 changes: 8 additions & 33 deletions pull_into_place/big_jobs/pip_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,13 @@
#$ -l netapp=1G
#$ -cwd

import os, sys, subprocess
from pull_into_place import big_jobs

workspace, job_id, task_id, parameters = big_jobs.initiate()
output_prefix = '{0}/{1}_{2:06d}_'.format(workspace.output_dir, job_id, task_id)
test_run = parameters.get('test_run', False)

big_jobs.print_debug_info()
big_jobs.run_command([
workspace.rosetta_scripts_path,
'-database', workspace.rosetta_database_path,
'-in:file:s', workspace.input_pdb_path,
'-in:file:native', workspace.input_pdb_path,
'-out:prefix', output_prefix,
'-out:no_nstruct_label',
'-out:overwrite',
'-out:pdb_gz',
'-out:mute', 'protocols.loops.loops_main',
'-parser:protocol', workspace.build_script_path,
'-parser:script_vars',
'wts_file=' + workspace.scorefxn_path,
'cst_file=' + workspace.restraints_path,
'loop_file=' + workspace.loops_path,
'fast=' + ('yes' if test_run else 'no'),
'loop_start=' + str(workspace.loop_boundaries[0]),
'loop_end=' + str(workspace.loop_boundaries[1]),
'outputs_folder=' + workspace.seqprof_dir,
'design_number=' + '{0}_{1:06d}'.format(job_id,task_id),
'vall_path=' + (workspace.rosetta_vall_path(test_run)),
'fragment_weights=' + workspace.fragment_weights_path,
'-packing:resfile', workspace.resfile_path,
'-constraints:cst_fa_file', workspace.restraints_path,
] + workspace.fragments_flags(workspace.input_pdb_path) + [
'@', workspace.flags_path,
])
# Read the workspace and the parameters for this array task.
workspace, job_info = big_jobs.initiate()

# The build protocol uses the resfile, the restraints, and the fragment
# libraries.
big_jobs.run_rosetta(
        workspace, job_info,
        use_resfile=True,
        use_restraints=True,
        use_fragments=True,
)

# Report resource usage (e.g. memory) for this task.
big_jobs.debrief()
40 changes: 8 additions & 32 deletions pull_into_place/big_jobs/pip_design.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,38 +7,14 @@
#$ -l h_core=0
#$ -cwd


import os, sys, subprocess
from pull_into_place import big_jobs

workspace, job_id, task_id, parameters = big_jobs.initiate()
test_run = parameters.get('test_run', False)

bb_models = parameters['inputs']
bb_model = bb_models[task_id % len(bb_models)]
design_id = task_id // len(bb_models)
workspace, job_info = big_jobs.initiate()

big_jobs.print_debug_info()
big_jobs.run_command([
workspace.rosetta_scripts_path,
'-database', workspace.rosetta_database_path,
'-in:file:s', workspace.input_path(bb_model),
'-in:file:native', workspace.input_pdb_path,
'-out:prefix', workspace.output_dir + '/',
'-out:suffix', '_{0:03}'.format(design_id),
'-out:no_nstruct_label',
'-out:overwrite',
'-out:pdb_gz',
'-parser:protocol', workspace.design_script_path,
'-parser:script_vars',
'wts_file=' + workspace.scorefxn_path,
'cst_file=' + workspace.restraints_path,
'loop_start=' + str(workspace.loop_boundaries[0]),
'loop_end=' + str(workspace.loop_boundaries[1]),
'outputs_folder=' + workspace.seqprof_dir,
'design_number=' + bb_model + '_{0:03}'.format(design_id),
'vall_path=' + (workspace.rosetta_vall_path(test_run)),
'fragment_weights=' + workspace.fragment_weights_path,
'-packing:resfile', workspace.resfile_path,
'@', workspace.flags_path,
])
# I wasn't able to get PackRotamers to respect any restraints set on the
# command line, so instead the restraints are set in the protocol itself.
big_jobs.run_rosetta(
        workspace, job_info,
        use_resfile=True,
)

# Report resource usage (e.g. memory) for this task.
big_jobs.debrief()
39 changes: 6 additions & 33 deletions pull_into_place/big_jobs/pip_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,11 @@
#$ -l netapp=1G
#$ -cwd

import os, sys, subprocess
from pull_into_place import big_jobs

workspace, job_id, task_id, parameters = big_jobs.initiate()

designs = parameters['inputs']
design = designs[task_id % len(designs)]
test_run = parameters.get('test_run', False)

big_jobs.print_debug_info()
big_jobs.run_command([
workspace.rosetta_scripts_path,
'-database', workspace.rosetta_database_path,
'-in:file:s', workspace.input_path(design),
'-in:file:native', workspace.input_pdb_path,
'-out:prefix', workspace.output_subdir(design) + '/',
'-out:suffix', '_{0:03d}'.format(task_id / len(designs)),
'-out:no_nstruct_label',
'-out:overwrite',
'-out:pdb_gz',
'-out:mute', 'protocols.loops.loops_main',
'-parser:protocol', workspace.validate_script_path,
'-parser:script_vars',
'wts_file=' + workspace.scorefxn_path,
'loop_file=' + workspace.loops_path,
'fast=' + ('yes' if test_run else 'no'),
'loop_start=' + str(workspace.loop_boundaries[0]),
'loop_end=' + str(workspace.loop_boundaries[1]),
'outputs_folder=' + workspace.seqprof_dir,
'design_number=' + design + '_{0:03d}'.format(task_id / len(designs)),
'vall_path=' + workspace.rosetta_vall_path(test_run),
'fragment_weights=' + workspace.fragment_weights_path,
] + workspace.fragments_flags(design) + [
'@', workspace.flags_path,
])
# Read the workspace and the parameters for this array task.
workspace, job_info = big_jobs.initiate()

# Validation runs need the fragment flags, but neither the resfile nor the
# command-line restraints.
big_jobs.run_rosetta(
        workspace, job_info,
        use_fragments=True,
)

# Report resource usage (e.g. memory) for this task.
big_jobs.debrief()
46 changes: 41 additions & 5 deletions pull_into_place/big_jobs/standard_params/filters.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,44 @@
<FILTERS>
<PackStat name="PackStat Score [[+]]" threshold="0"/>
<PreProline name="Pre-Proline Potential [[-]]" use_statistical_potential="true" />
<ExposedHydrophobics name="Exposed Hydrophobic Residue SASA [[-]]" sasa_cutoff="20" threshold="-1" />
<Foldability name="Foldability Score [[+]]" start_res="%%loop_start%%" end_res="%%loop_end%%" />
<FragmentScoreFilter name="[[-]]Max 9-Residue Fragment Crmsd" scoretype="FragmentCrmsd" sort_by="FragmentCrmsd" threshold="9999" direction="-" start_res="%%loop_start%%" end_res="%%loop_end%%" compute="maximum" outputs_folder="%%outputs_folder%%" outputs_name="%%design_number%%" csblast="/netapp/home/krivacic/software/csblast-2.2.3_linux64" blast_pgp="/netapp/home/klabqb3backrub/tools/blast-2.2.26/bin/blastpgp" placeholder_seqs="/netapp/home/xingjiepan/Databases/BLAST/placeholder/placeholder_seqs" psipred="/netapp/home/xingjiepan/Softwares/parametric_scaffold_design/dependencies/dependencies/psipred/runpsipred_single" sparks-x="/netapp/home/klabqb3backrub/tools/sparks-x" sparks-x_query="/netapp/home/klabqb3backrub/tools/sparks-x/bin/buildinp_query.sh" frags_scoring_config="%%fragment_weights%%" n_frags="200" n_candidates="1000" fragment_size="9" vall_path="%%vall_path%%" />
<PackStat
name="PackStat Score [[+]]"
threshold="0"
/>
<PreProline
name="Pre-Proline Potential [[-]]"
use_statistical_potential="true"
/>
<ExposedHydrophobics
name="Exposed Hydrophobic Residue SASA [[-]]"
sasa_cutoff="20"
threshold="-1"
/>
<Foldability
name="Foldability Score [[+]]"
start_res="%%loop_start%%"
end_res="%%loop_end%%"
/>
<FragmentScoreFilter
name="Max 9-Residue Fragment Crmsd[[-]]"
scoretype="FragmentCrmsd"
sort_by="FragmentCrmsd"
threshold="9999"
direction="-"
start_res="%%loop_start%%"
end_res="%%loop_end%%"
compute="maximum"
outputs_folder="%%outputs_folder%%"
outputs_name="%%design_number%%"
csblast="/netapp/home/krivacic/software/csblast-2.2.3_linux64"
blast_pgp="/netapp/home/klabqb3backrub/tools/blast-2.2.26/bin/blastpgp"
placeholder_seqs="/netapp/home/xingjiepan/Databases/BLAST/placeholder/placeholder_seqs"
psipred="/netapp/home/xingjiepan/Softwares/parametric_scaffold_design/dependencies/dependencies/psipred/runpsipred_single"
sparks-x="/netapp/home/klabqb3backrub/tools/sparks-x"
sparks-x_query="/netapp/home/klabqb3backrub/tools/sparks-x/bin/buildinp_query.sh"
frags_scoring_config="%%fragment_weights%%"
n_frags="200"
n_candidates="1000"
fragment_size="9"
vall_path="%%vall_path%%"
/>

</FILTERS>
15 changes: 3 additions & 12 deletions pull_into_place/commands/07_setup_design_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,8 @@
Print out the command-line that would be used to generate fragments,
but don't actually run it.
-x, --clear
Remove any previously generated fragment files.
Simply rerun this command if some of your fragment generation jobs fail. By
default it will only submit jobs for inputs that are missing valid fragment
files. You can force the fragments to be regenerated from scratch by passing
the '--clear' flag.
Simply rerun this command if some of your fragment generation jobs fail. It
will only submit jobs for inputs that are missing valid fragment files.
"""

import subprocess
Expand All @@ -43,16 +38,12 @@ def main():
workspace.check_rosetta()
workspace.make_dirs()

# Do this before working out the 'klab_generate_fragments' command, because
# it may affect which inputs are picked.
if args['--clear'] and not args['--dry-run']:
workspace.clear_fragments()

generate_fragments = [
'klab_generate_fragments',
'--loops_file', workspace.loops_path,
'--outdir', workspace.fragments_dir,
'--memfree', args['--mem-free'],
'--overwrite',
] + pick_inputs(workspace)

if args['--dry-run']:
Expand Down

0 comments on commit b8b6a8d

Please sign in to comment.