Skip to content

Commit

Permalink
Merge pull request #16 from Kortemme-Lab/external_metrics
Browse files Browse the repository at this point in the history
 Refactor how jobs are run, and add external metrics.
  • Loading branch information
kalekundert committed Mar 16, 2018
2 parents 0ca2667 + c22caf0 commit b8b6a8d
Show file tree
Hide file tree
Showing 7 changed files with 253 additions and 158 deletions.
121 changes: 97 additions & 24 deletions pull_into_place/big_jobs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python2

import sys, os, re, json, time, subprocess
import sys, os, re, json, subprocess, gzip
from klab.process import tee
from . import pipeline

def submit(script, workspace, **params):
Expand All @@ -22,10 +23,9 @@ def submit(script, workspace, **params):

if test_run:
nstruct = 50
max_runtime = '2:00:00'

if nstruct is None:
raise TypeError("sumbit() requires the keyword argument 'nstruct' for production runs.")
raise TypeError("submit() requires the keyword argument 'nstruct' for production runs.")

# Submit the job and put it immediately into the hold state.

Expand All @@ -50,7 +50,7 @@ def submit(script, workspace, **params):

job_id = status_match.group(1)

with open(workspace.job_params_path(job_id), 'w') as file:
with open(workspace.job_info_path(job_id), 'w') as file:
json.dump(params, file)

# Release the hold on the job.
Expand All @@ -61,20 +61,105 @@ def submit(script, workspace, **params):

def initiate():
    """
    Return some relevant information about the currently running job.

    Returns a (workspace, job_info) tuple.  The workspace is found from the
    directory given on the command line, and job_info is the parameter dict
    that submit() serialized to JSON when the job was queued, augmented with
    this task's ids.

    Note: the interleaved leftovers from the previous revision (job_id /
    task_id locals and the read_params() call, a function that no longer
    exists) were removed; they were dead code that shadowed the new logic.
    """
    print_debug_header()

    workspace = pipeline.workspace_from_dir(sys.argv[1])
    workspace.cd_to_root()

    # SGE communicates the job and task ids via environment variables.
    # Task ids are 1-based, so convert to a 0-based index.
    job_info = read_job_info(workspace.job_info_path(os.environ['JOB_ID']))
    job_info['job_id'] = int(os.environ['JOB_ID'])
    job_info['task_id'] = int(os.environ['SGE_TASK_ID']) - 1

    return workspace, job_info

def debrief():
    """
    Report the amount of memory used by this job, among other things.
    """
    # qstat accepts "<job>.<task>" to report on one task of an array job.
    task = '{0}.{1}'.format(os.environ['JOB_ID'], os.environ['SGE_TASK_ID'])
    run_command(['/usr/local/sge/bin/linux-x64/qstat', '-j', task])

def run_rosetta(workspace, job_info,
        use_resfile=False, use_restraints=False, use_fragments=False):
    """
    Run a RosettaScripts simulation for the current array task, then run any
    external metric scripts on the resulting model.

    Arguments:
        workspace: the pipeline workspace; supplies all the input/output
            paths and Rosetta settings used below.
        job_info: the parameter dict returned by initiate(); used to locate
            this task's input structure and output names, and to decide
            whether this is a test run.
        use_resfile: if true, pass the workspace resfile to the packer.
        use_restraints: if true, pass the restraint file on the command line.
        use_fragments: if true, add the workspace's fragment flags.
    """

    # Test runs ask the XML protocol to take shortcuts ('fast=yes') and use
    # a smaller vall database.
    test_run = job_info.get('test_run', False)
    rosetta_cmd = [
            workspace.rosetta_scripts_path,
            '-database', workspace.rosetta_database_path,
            '-in:file:s', workspace.input_path(job_info),
            '-in:file:native', workspace.input_path(job_info),
            '-out:prefix', workspace.output_prefix(job_info),
            '-out:suffix', workspace.output_suffix(job_info),
            '-out:no_nstruct_label',
            '-out:overwrite',
            '-out:pdb_gz',
            '-out:mute', 'protocols.loops.loops_main',
            '-parser:protocol', workspace.protocol_path,
            # Variables substituted into the XML protocol (%%var%% syntax).
            '-parser:script_vars',
                'wts_file=' + workspace.scorefxn_path,
                'cst_file=' + workspace.restraints_path,
                'loop_file=' + workspace.loops_path,
                'loop_start=' + str(workspace.loop_boundaries[0]),
                'loop_end=' + str(workspace.loop_boundaries[1]),
                'outputs_folder=' + workspace.seqprof_dir,
                'design_number=' + workspace.output_basename(job_info),
                'vall_path=' + workspace.rosetta_vall_path(test_run),
                'fragment_weights=' + workspace.fragment_weights_path,
                'fast=' + ('yes' if test_run else 'no'),
    ]
    if use_resfile: rosetta_cmd += [
            '-packing:resfile', workspace.resfile_path,
    ]
    if use_restraints: rosetta_cmd += [
            '-constraints:cst_fa_file', workspace.restraints_path,
    ]
    if use_fragments: rosetta_cmd += \
            workspace.fragments_flags(workspace.input_path(job_info))

    # The flags file comes last so it can be overridden by the flags above.
    # NOTE(review): presumably '@' and the path are joined by rosetta's
    # option parser — confirm the two-element form is intended.
    rosetta_cmd += [
            '@', workspace.flags_path,
    ]

    run_command(rosetta_cmd)
    run_external_metrics(workspace, job_info)

def run_external_metrics(workspace, job_info):
pdb_path = workspace.output_path(job_info)

for metric in workspace.metric_scripts:
command = metric, pdb_path

print "Working directory:", os.getcwd()
print "Command:", ' '.join(command)
sys.stdout.flush()

stdout, stderr = tee([metric, pdb_path])
file = gzip.open(pdb_path, 'a')

for line in stdout.strip().split('\n'):
if line.strip():
file.write('EXTRA_METRIC {0}\n'.format(line))

file.close()

def run_command(command):
print "Working directory:", os.getcwd()
print "Command:", ' '.join(command)
sys.stdout.flush()

process = subprocess.Popen(command)

print "Process ID:", process.pid
print
sys.stdout.flush()

return workspace, job_id, task_id, job_params
process.wait()

def read_job_info(json_path):
    """
    Return the job parameters parsed from the given JSON file.

    The file is written by submit() when the job is queued and read back by
    initiate() once the job starts running on the cluster.  (The stale,
    truncated 'read_params' header lines that had been fused onto this
    definition were removed; they made the span syntactically invalid.)
    """
    with open(json_path) as file:
        return json.load(file)

def print_debug_info():
def print_debug_header():
from datetime import datetime
from socket import gethostname

Expand All @@ -86,17 +171,5 @@ def print_debug_info():
print
sys.stdout.flush()

def run_command(command):
print "Working directory:", os.getcwd()
print "Command:", ' '.join(command)
sys.stdout.flush()

process = subprocess.Popen(command)

print "Process ID:", process.pid
print
sys.stdout.flush()
process.wait()
jobnumber = os.environ['JOB_ID'] + '.' + os.environ['SGE_TASK_ID']
print 'Job Number:', jobnumber
subprocess.call(['/usr/local/sge/bin/linux-x64/qstat','-j',jobnumber])

41 changes: 8 additions & 33 deletions pull_into_place/big_jobs/pip_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,13 @@
#$ -l netapp=1G
#$ -cwd

import os, sys, subprocess
from pull_into_place import big_jobs

workspace, job_id, task_id, parameters = big_jobs.initiate()
output_prefix = '{0}/{1}_{2:06d}_'.format(workspace.output_dir, job_id, task_id)
test_run = parameters.get('test_run', False)

big_jobs.print_debug_info()
big_jobs.run_command([
workspace.rosetta_scripts_path,
'-database', workspace.rosetta_database_path,
'-in:file:s', workspace.input_pdb_path,
'-in:file:native', workspace.input_pdb_path,
'-out:prefix', output_prefix,
'-out:no_nstruct_label',
'-out:overwrite',
'-out:pdb_gz',
'-out:mute', 'protocols.loops.loops_main',
'-parser:protocol', workspace.build_script_path,
'-parser:script_vars',
'wts_file=' + workspace.scorefxn_path,
'cst_file=' + workspace.restraints_path,
'loop_file=' + workspace.loops_path,
'fast=' + ('yes' if test_run else 'no'),
'loop_start=' + str(workspace.loop_boundaries[0]),
'loop_end=' + str(workspace.loop_boundaries[1]),
'outputs_folder=' + workspace.seqprof_dir,
'design_number=' + '{0}_{1:06d}'.format(job_id,task_id),
'vall_path=' + (workspace.rosetta_vall_path(test_run)),
'fragment_weights=' + workspace.fragment_weights_path,
'-packing:resfile', workspace.resfile_path,
'-constraints:cst_fa_file', workspace.restraints_path,
] + workspace.fragments_flags(workspace.input_pdb_path) + [
'@', workspace.flags_path,
])
# Read the workspace and the parameters for this array task.
workspace, job_info = big_jobs.initiate()

# The build protocol uses the resfile, the restraints, and the fragment
# libraries.
big_jobs.run_rosetta(
        workspace, job_info,
        use_resfile=True,
        use_restraints=True,
        use_fragments=True,
)

# Report resource usage (e.g. memory) for this task.
big_jobs.debrief()
40 changes: 8 additions & 32 deletions pull_into_place/big_jobs/pip_design.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,38 +7,14 @@
#$ -l h_core=0
#$ -cwd


import os, sys, subprocess
from pull_into_place import big_jobs

workspace, job_id, task_id, parameters = big_jobs.initiate()
test_run = parameters.get('test_run', False)

bb_models = parameters['inputs']
bb_model = bb_models[task_id % len(bb_models)]
design_id = task_id // len(bb_models)
workspace, job_info = big_jobs.initiate()

big_jobs.print_debug_info()
big_jobs.run_command([
workspace.rosetta_scripts_path,
'-database', workspace.rosetta_database_path,
'-in:file:s', workspace.input_path(bb_model),
'-in:file:native', workspace.input_pdb_path,
'-out:prefix', workspace.output_dir + '/',
'-out:suffix', '_{0:03}'.format(design_id),
'-out:no_nstruct_label',
'-out:overwrite',
'-out:pdb_gz',
'-parser:protocol', workspace.design_script_path,
'-parser:script_vars',
'wts_file=' + workspace.scorefxn_path,
'cst_file=' + workspace.restraints_path,
'loop_start=' + str(workspace.loop_boundaries[0]),
'loop_end=' + str(workspace.loop_boundaries[1]),
'outputs_folder=' + workspace.seqprof_dir,
'design_number=' + bb_model + '_{0:03}'.format(design_id),
'vall_path=' + (workspace.rosetta_vall_path(test_run)),
'fragment_weights=' + workspace.fragment_weights_path,
'-packing:resfile', workspace.resfile_path,
'@', workspace.flags_path,
])
# I wasn't able to get PackRotamers to respect any restraints set on the
# command line, so instead the restraints are set in the protocol itself.
big_jobs.run_rosetta(
        workspace, job_info,
        use_resfile=True,
)

# Report resource usage (e.g. memory) for this task.
big_jobs.debrief()
39 changes: 6 additions & 33 deletions pull_into_place/big_jobs/pip_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,11 @@
#$ -l netapp=1G
#$ -cwd

import os, sys, subprocess
from pull_into_place import big_jobs

workspace, job_id, task_id, parameters = big_jobs.initiate()

designs = parameters['inputs']
design = designs[task_id % len(designs)]
test_run = parameters.get('test_run', False)

big_jobs.print_debug_info()
big_jobs.run_command([
workspace.rosetta_scripts_path,
'-database', workspace.rosetta_database_path,
'-in:file:s', workspace.input_path(design),
'-in:file:native', workspace.input_pdb_path,
'-out:prefix', workspace.output_subdir(design) + '/',
'-out:suffix', '_{0:03d}'.format(task_id / len(designs)),
'-out:no_nstruct_label',
'-out:overwrite',
'-out:pdb_gz',
'-out:mute', 'protocols.loops.loops_main',
'-parser:protocol', workspace.validate_script_path,
'-parser:script_vars',
'wts_file=' + workspace.scorefxn_path,
'loop_file=' + workspace.loops_path,
'fast=' + ('yes' if test_run else 'no'),
'loop_start=' + str(workspace.loop_boundaries[0]),
'loop_end=' + str(workspace.loop_boundaries[1]),
'outputs_folder=' + workspace.seqprof_dir,
'design_number=' + design + '_{0:03d}'.format(task_id / len(designs)),
'vall_path=' + workspace.rosetta_vall_path(test_run),
'fragment_weights=' + workspace.fragment_weights_path,
] + workspace.fragments_flags(design) + [
'@', workspace.flags_path,
])
# Read the workspace and the parameters for this array task.
workspace, job_info = big_jobs.initiate()

# Validation runs need the fragment flags, but neither the resfile nor the
# command-line restraints.
big_jobs.run_rosetta(
        workspace, job_info,
        use_fragments=True,
)

# Report resource usage (e.g. memory) for this task.
big_jobs.debrief()
46 changes: 41 additions & 5 deletions pull_into_place/big_jobs/standard_params/filters.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,44 @@
<FILTERS>
<PackStat name="PackStat Score [[+]]" threshold="0"/>
<PreProline name="Pre-Proline Potential [[-]]" use_statistical_potential="true" />
<ExposedHydrophobics name="Exposed Hydrophobic Residue SASA [[-]]" sasa_cutoff="20" threshold="-1" />
<Foldability name="Foldability Score [[+]]" start_res="%%loop_start%%" end_res="%%loop_end%%" />
<FragmentScoreFilter name="[[-]]Max 9-Residue Fragment Crmsd" scoretype="FragmentCrmsd" sort_by="FragmentCrmsd" threshold="9999" direction="-" start_res="%%loop_start%%" end_res="%%loop_end%%" compute="maximum" outputs_folder="%%outputs_folder%%" outputs_name="%%design_number%%" csblast="/netapp/home/krivacic/software/csblast-2.2.3_linux64" blast_pgp="/netapp/home/klabqb3backrub/tools/blast-2.2.26/bin/blastpgp" placeholder_seqs="/netapp/home/xingjiepan/Databases/BLAST/placeholder/placeholder_seqs" psipred="/netapp/home/xingjiepan/Softwares/parametric_scaffold_design/dependencies/dependencies/psipred/runpsipred_single" sparks-x="/netapp/home/klabqb3backrub/tools/sparks-x" sparks-x_query="/netapp/home/klabqb3backrub/tools/sparks-x/bin/buildinp_query.sh" frags_scoring_config="%%fragment_weights%%" n_frags="200" n_candidates="1000" fragment_size="9" vall_path="%%vall_path%%" />
<PackStat
name="PackStat Score [[+]]"
threshold="0"
/>
<PreProline
name="Pre-Proline Potential [[-]]"
use_statistical_potential="true"
/>
<ExposedHydrophobics
name="Exposed Hydrophobic Residue SASA [[-]]"
sasa_cutoff="20"
threshold="-1"
/>
<Foldability
name="Foldability Score [[+]]"
start_res="%%loop_start%%"
end_res="%%loop_end%%"
/>
<FragmentScoreFilter
name="Max 9-Residue Fragment Crmsd[[-]]"
scoretype="FragmentCrmsd"
sort_by="FragmentCrmsd"
threshold="9999"
direction="-"
start_res="%%loop_start%%"
end_res="%%loop_end%%"
compute="maximum"
outputs_folder="%%outputs_folder%%"
outputs_name="%%design_number%%"
csblast="/netapp/home/krivacic/software/csblast-2.2.3_linux64"
blast_pgp="/netapp/home/klabqb3backrub/tools/blast-2.2.26/bin/blastpgp"
placeholder_seqs="/netapp/home/xingjiepan/Databases/BLAST/placeholder/placeholder_seqs"
psipred="/netapp/home/xingjiepan/Softwares/parametric_scaffold_design/dependencies/dependencies/psipred/runpsipred_single"
sparks-x="/netapp/home/klabqb3backrub/tools/sparks-x"
sparks-x_query="/netapp/home/klabqb3backrub/tools/sparks-x/bin/buildinp_query.sh"
frags_scoring_config="%%fragment_weights%%"
n_frags="200"
n_candidates="1000"
fragment_size="9"
vall_path="%%vall_path%%"
/>

</FILTERS>
15 changes: 3 additions & 12 deletions pull_into_place/commands/07_setup_design_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,8 @@
Print out the command-line that would be used to generate fragments,
but don't actually run it.
-x, --clear
Remove any previously generated fragment files.
Simply rerun this command if some of your fragment generation jobs fail. By
default it will only submit jobs for inputs that are missing valid fragment
files. You can force the fragments to be regenerated from scratch by passing
the '--clear' flag.
Simply rerun this command if some of your fragment generation jobs fail. It
will only submit jobs for inputs that are missing valid fragment files.
"""

import subprocess
Expand All @@ -43,16 +38,12 @@ def main():
workspace.check_rosetta()
workspace.make_dirs()

# Do this before working out the 'klab_generate_fragments' command, because
# it may affect which inputs are picked.
if args['--clear'] and not args['--dry-run']:
workspace.clear_fragments()

generate_fragments = [
'klab_generate_fragments',
'--loops_file', workspace.loops_path,
'--outdir', workspace.fragments_dir,
'--memfree', args['--mem-free'],
'--overwrite',
] + pick_inputs(workspace)

if args['--dry-run']:
Expand Down

0 comments on commit b8b6a8d

Please sign in to comment.