# Make a new set of decoys for PPI decoy discrimination

## Import `Python` modules

In [1]:
import os
import sys
import subprocess
import pandas
import glob
import random
import math
import time
import numpy as np
import scipy.stats
import umap
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=2, style='ticks', palette='colorblind')

# custom modules
sys.path.append('scripts/')
import design_utils

# Root directory under which this notebook writes all of its results.
resultsdir = '/net/scratch/haddox/2021/ppi_benchmark/results'
# `exist_ok=True` makes this a no-op when the directory already exists,
# so no separate check-then-create step is needed.
os.makedirs(resultsdir, exist_ok=True)

## Make a list of natives to take through the pipeline

In [2]:
# Make a list of all natives. The first four characters of each PDB file
# name are used as the identifier of the native.
all_natives = [
    os.path.basename(path)[:4]
    for path in glob.glob('data/natives/*.pdb')
]
print(f"all natives: {len(all_natives)}")

# Make a list of the 100 natives Hahnbeom and Frank
# originally considered for training
natives_100 = [
    os.path.basename(path)[:4]
    for path in glob.glob(
        '/home/dimaio/optE2/dualoptE/decoys/docking/*.pdb'
    )
]

# Make a list of natives used in training. Every line of the job file
# that invokes `run_docking_single.sh` is expected to consist of exactly
# two whitespace-separated fields: the command and the native. A line
# with a different number of fields raises a ValueError on unpacking.
alljobs = '/home/haddox/2019/optE_eval/alljobs'
training_natives = []
with open(alljobs) as f:
    for line in f:
        if './run_docking_single.sh' in line:
            (_cmd, native) = line.strip().split()
            training_natives.append(native)
print(f"training natives: {len(training_natives)}")

# Validation natives. Sets are used for the membership tests; the lists
# above are kept as-is because other cells refer to them.
training_natives_set = set(training_natives)
natives_100_set = set(natives_100)
validation_natives = [
    native for native in all_natives
    if native not in training_natives_set
]
new_validation_natives = [
    native for native in validation_natives
    if native not in natives_100_set
]
print(f"all validation natives: {len(validation_natives)}")
print(f"new validation natives: {len(new_validation_natives)}")

all natives: 176
training natives: 65
all validation natives: 111
new validation natives: 76


## Extract PDBs from silent files containing the output of Hahnbeom's global-docking runs

In [3]:
# Extract all PDBs from a silent file with output from
# global docking
#
# For every native that was not used in training and that has a silent
# file, this cell creates `{resultsdir}/global_docks/{native}/`, extracts
# the docked PDBs into it, copies the native PDB alongside them, and
# strips hydrogens from every PDB in the directory. The lines starting
# with `!` are IPython shell escapes, so this cell only runs inside
# IPython/Jupyter.
for native in all_natives:
    # Training natives are not part of this benchmark
    if native in training_natives:
        continue
    silent_file = os.path.join(
        '/net/scratch/haddox/2021/ppi_benchmark/data/comb1000/',
        f'{native}.1000.rescore.out'
    )
    # Natives without a silent file are skipped silently
    if not os.path.isfile(silent_file):
        continue
    output_dir = f'{resultsdir}/global_docks/{native}/'
    # The existence of the output directory is the only marker that a
    # native has been processed: everything below runs only the first
    # time, when the directory has to be created
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

        # Extract PDBs
        print(f'Extracting PDBs for {native}')
        design_utils.extract_pdbs_from_silent_file(
            working_dir=output_dir,
            silent_file=silent_file
        )

        # Copy the PDB of the native to the same directory as the
        # global docks
        print("Copying native PDB to same directory")
        native_pdb = f'data/natives/{native}_bound_native.pdb'
        new_pdb = os.path.join(output_dir, os.path.basename(native_pdb))
        ! cp {native_pdb} {new_pdb}
        # NOTE(review): presumably this pause gives the extraction and
        # copy time to finish before the directory is globbed below --
        # TODO confirm whether the extraction helper returns before its
        # work is done
        time.sleep(60)

        # Strip PDBs of all hydrogens to avoid bug with hydrogen
        # placement from old silent files
        print("Stripping hydrogens from all PDBs")
        pdbs = glob.glob(os.path.join(output_dir, '*.pdb'))
        for pdb in pdbs:
            cmd = f'scripts/stripH_inplace.pl {pdb}'
            ! {cmd}
    else:
        # Already processed: the PDBs are only listed for the optional
        # debugging print below
        pdbs = glob.glob(os.path.join(output_dir, '*.pdb'))
        #print(native, len(pdbs))

## Make jittered version of each global dock

In [4]:
# Define parameters for jittering inputs
run_dict = {
    1 : {
        'internal_sampes' : 25, # number of jitters to make
        'cluster_rmsd' : 0.2, # how tightly to cluster jitters
        'max_translate' : 2, # maximum translation distance
        'max_angle' : 1 # maximum angle for rotating things
    },
    2 : {
        'internal_sampes' : 25, # number of jitters to make
        'cluster_rmsd' : 0.2, # how tightly to cluster jitters
        'max_translate' : 2, # maximum translation distance
        'max_angle' : 10 # maximum angle for rotating things
    }
}

# Cycle over all input PDBs and jitter chain B
boinc_tag = 'HKH_2021'
for native in validation_natives: # natives:
    
    # I have already generated all jitters
    continue
    
    pdb_dir = f'{resultsdir}/global_docks/{native}/'
    pdbs = glob.glob(os.path.join(pdb_dir, '*.pdb'))
    if len(pdbs) == 0:
        continue
    file_listing_pdbs = os.path.join(pdb_dir, 'pdbs.txt')
    if not os.path.isfile(file_listing_pdbs):
        with open(file_listing_pdbs, 'w') as f:
            for pdb in pdbs:
                f.write(f'{pdb}\n')
    env_name = 'high_throughput_design_for_E_function_opt'
    runs = [1, 2]
    for run in runs:

        # Assemble command-line argument
        output_dir = os.path.join(
            pdb_dir,
            'jittered_poses',
            f'run_{run}/'
        )
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        cmd = ' '.join([
            'python',
            'scripts/jitter_binder.py',
            file_listing_pdbs,
            f'-output_dir {output_dir}',
            f'-output_pdb_prefix {boinc_tag}_{native}_run_{run}_',
            f"-internal_samples {run_dict[run]['internal_sampes']}",
            f"-cluster_rmsd {run_dict[run]['cluster_rmsd']}",
            f"-max_translate {run_dict[run]['max_translate']}",
            f"-max_angle {run_dict[run]['max_angle']}",
        ])
        cmd = 'source activate {0}'.format(env_name) + '\n' + cmd

        # Carry out the job
        sbatch_file_name = os.path.join(
            output_dir,
            'jitter_binder.sbatch'
        )
        if not os.path.isfile(sbatch_file_name):
            print(native, f'run_{run}', len(pdbs))
            design_utils.WriteSbatchFile(
                sbatch_file_name,
                command=cmd,
                queue_type='medium',
                memory='2g'
            )
            ! sbatch {sbatch_file_name}

## Use the beta_soft score function to score global docks before and after jittering to identify a subset to relax

Score global docks before and after jittering

In [5]:
# Score global docks before and after jittering
score_app_path = (
    '/home/haddox/Rosetta/main/source/bin/score.linuxgccrelease'
)
for native in validation_natives: # natives:

    # I have already scored all jitters
    continue

    pdb_dir = f'{resultsdir}/global_docks/{native}/'
    if not os.path.isdir(pdb_dir):
        continue

    # Directories to score: first the original global docks, then the
    # output directory of each jitter run
    jitter_run_dirs = [
        os.path.join(pdb_dir, 'jittered_poses', f'run_{run}/')
        for run in [1, 2]
    ]
    for output_dir in [pdb_dir] + jitter_run_dirs:
        pdbs = glob.glob(os.path.join(output_dir, '*.pdb'))

        # A jitter run without any PDBs is skipped; the directory with
        # the original docks is processed regardless
        if output_dir != pdb_dir and not pdbs:
            continue

        # Write the list of PDBs to score once; an existing list is reused
        file_listing_pdbs = os.path.join(output_dir, 'pdbs.txt')
        if not os.path.isfile(file_listing_pdbs):
            with open(file_listing_pdbs, 'w') as f:
                f.writelines(f'{pdb}\n' for pdb in pdbs)

        # An existing sbatch file means the scoring job for this
        # directory was already set up
        scores_file_prefix = os.path.join(output_dir, 'soft_rep_score')
        if os.path.isfile(scores_file_prefix + '.sbatch'):
            continue
        design_utils.compute_score_using_rosettascripts(
            score_app_path=score_app_path,
            file_listing_pdbs=file_listing_pdbs,
            weights_file='beta_soft',
            extra_args=['-beta'],
            output_dir=output_dir,
            scores_file_prefix=scores_file_prefix,
            submit_sbatch_job=True,
            queue_type='long',
            memory='2g'
        )

Read in scores from above and, for each global-dock parent, pick the 5 best-scoring structures (the original dock and its jitters are ranked together).

In [6]:
# Cycle through natives one at a time
for native in new_validation_natives:

    # I have already selected best-scoring jitters
    continue
    
    # Read in scores for global docks before jittering
    dfs = []
    score_file = f'{resultsdir}/global_docks/{native}/soft_rep_score.sc'
    if not os.path.isfile(score_file):
        print(f"Missing scores for {native}")
        continue
    df = pandas.read_csv(score_file, sep='\s+')
    df.dropna(subset=['description'], inplace=True)
    df['score'] = df['score'].astype(float)
    df['pdb'] = df['description'].apply(
        lambda x: x[:-5] + '.pdb'
    )
    df['native'] = native
    df['global_dock_parent'] = df['pdb'].apply(
        lambda x: os.path.basename(x).replace('.pdb', '')
    )
    df['original_dock'] = True
    del df['SCORE:']
    dfs.append(df)

    # Read in scores for global docks after jittering
    jitter_dir = f'{resultsdir}/global_docks/{native}/jittered_poses/'
    score_files = glob.glob(os.path.join(
        jitter_dir,
        'run*/soft_rep_score.sc'
    ))
    for score_file in score_files:
        df = pandas.read_csv(score_file, sep='\s+')
        df.dropna(subset=['description'], inplace=True)
        df['score'] = df['score'].astype(float)
        df['pdb'] = df['description'].apply(
            lambda x: x[:-5] + '.pdb'
        )
        df['native'] = native
        df['global_dock_parent'] = df['pdb'].str.extract(
            r'.+HKH_2021_\w+_run_[\d]_(.+)_chain_.+pdb'.format(native)
        )
        df['original_dock'] = False
        del df['SCORE:']
        dfs.append(df)
    soft_scores_df = pandas.concat(dfs, sort=False)
    soft_scores_df['pdb_bn'] = \
        soft_scores_df['pdb'].apply(os.path.basename)
    #soft_scores_df = soft_scores_df[
    #    ~soft_scores_df['global_dock_parent'].str.contains('_S_')
    #]
    del dfs
    if len(soft_scores_df) < 25000:
        print(native, len(soft_scores_df), 'continueing')
        continue
    print(native, len(soft_scores_df))

    # For each parent from the global dock, select a few
    # of the best-scoring structures for relaxing
    boinc_relax_dir = f'{resultsdir}/global_docks/{native}/boinc_relax/'
    home_boinc_relax_dir = boinc_relax_dir.replace(
        '/net/scratch/', '/home/'
    )
    for d in [boinc_relax_dir, home_boinc_relax_dir]:
        if not os.path.isdir(d):
            os.makedirs(d)
    parents = list(set(soft_scores_df['global_dock_parent']))
    soft_scores_df.sort_values('score', inplace=True)
    n_candidates_per_parent = 5
    for parent in parents:

        # Make a list of the best one or few of the best-scoring
        # decoys
        pdbs = list(soft_scores_df[
            soft_scores_df['global_dock_parent'] == parent
        ]['pdb'][:n_candidates_per_parent])

        # Copy PDBs to a new directory
        for pdb in pdbs:
            assert os.path.isfile(pdb), pdb

            # Copy PDB to directory on /net/scratch
            pdb_bn = os.path.basename(pdb)
            for d in [boinc_relax_dir]: # home_boinc_relax_dir
                if 'jittered_poses' in pdb:
                    new_pdb = os.path.join(d, pdb_bn)
                else:
                    new_pdb = os.path.join(
                        d,
                        f'{boinc_tag}_{native}_{pdb_bn}'
                    )
                if not os.path.isfile(new_pdb):
                    ! cp {pdb} {new_pdb}

    # Make 10 copies of the native PDB
    native_pdb = f'data/natives/{native}_bound_native.pdb'
    for i in range(10):
        pdb_bn = os.path.basename(native_pdb).replace(
            '.pdb', f'_{i}_xtal.pdb'
        )
        for d in [boinc_relax_dir, home_boinc_relax_dir]:
            new_pdb = os.path.join(
                d,
                f'{boinc_tag}_{pdb_bn}'
            )
            if not os.path.isfile(new_pdb):
                ! cp {native_pdb} {new_pdb}

Submit jobs to boinc

In [7]:
for native in new_validation_natives:

    # Get inputs
    boinc_relax_dir = f'{resultsdir}/global_docks/{native}/boinc_relax/'
    flags_file = 'scripts/HKH_2021_boinc_ppi_relax_flags.txt'
    flags_file_bn = os.path.basename(flags_file)
    new_flags_file = os.path.join(boinc_relax_dir, flags_file_bn)
    if not os.path.isfile(new_flags_file):
        print(new_flags_file)
        ! cp {flags_file} {new_flags_file}
    pdbs = glob.glob(os.path.join(boinc_relax_dir, '*.pdb'))
    if len(pdbs) == 0:
        print(f'Missing PDBs for {native}')
        continue

    # Cycle through PDBs and submit relax job for each one
    print(native, len(pdbs))
    for (i, pdb) in enumerate(pdbs):

        if i % 1000 == 0:
            print(i)

        # Make a submission file
        inputfile = os.path.basename(pdb)
        name = inputfile.replace('.pdb', '')
        resultfile = 'default.out.gz'
        boinc_submit_template = 'scripts/boinc_submit.txt'
        with open(boinc_submit_template) as f:
            protocol = f.read()
        protocol = protocol.replace('%%name%%', name)
        protocol = protocol.replace('%%description%%', name)
        protocol = protocol.replace('%%inputfile%%', inputfile)
        protocol = protocol.replace('%%resultfile%%', resultfile)
        boinc_submit_file = os.path.join(
            boinc_relax_dir,
            f'{name}.boinc_submit.txt'
        )
        if not os.path.isfile(boinc_submit_file):
            with open(boinc_submit_file, 'w') as f:
                f.write(protocol)

            # Submit job to boinc
            cmd = ' '.join([
                '/projects/boinc/bin/boinc_submit',
                os.path.basename(boinc_submit_file)
            ])
            boinc_job_outfile = \
                boinc_submit_file.replace('.txt', '.out')
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                cwd=boinc_relax_dir,
                shell=True
            )
            (out, err) = process.communicate()
            with open(boinc_job_outfile, 'wb') as f:
                f.write(out)
            if err:
                print(err)
                raise ValueError('Error\n{0}'.format(err))

1JTG 5010
0
1000
2000
3000
4000
5000
1Z0K 5010
0
1000
2000
3000
4000
5000
1FCC 5010
0
1000
2000
3000
4000
5000
1GHQ 5010
0
1000
2000
3000
4000
5000
Missing PDBs for 1N2C
1HE8 5010
0
1000
2000
3000
4000
5000
1HIA 5010
0
1000
2000
3000
4000
5000
1ACB 5010
0
1000
2000
3000
4000
5000
1R8S 5010
0
1000
2000
3000
4000
5000
1PVH 5010
0
1000
2000
3000
4000
5000
1KXP 5010
0
1000
2000
3000
4000
5000
1OFU 5010
0
1000
2000
3000
4000
5000
1JPS 5010
0
1000
2000
3000
4000
5000
1E6J 5010
0
1000
2000
3000
4000
5000
4CPA 5010
0
1000
2000
3000
4000
5000
1NW9 5010
0
1000
2000
3000
4000
5000
1K4C 5010
0
1000
2000
3000
4000
5000
1VFB 5010
0
1000
2000
3000
4000
5000
1AK4 5010
0
1000
2000
3000
4000
5000
1LFD 5010
0
1000
2000
3000
4000
5000
1XD3 5010
0
1000
2000
3000
4000
5000
1EER 5010
0
1000
2000
3000
4000
5000
1CGI 5010
0
1000
2000
3000
4000
5000
Missing PDBs for 1BGX
1I4D 5010
0
1000
2000
3000
4000
5000
2FJU 5010
0
1000
2000
3000
4000
5000
1ML0 5010
0
1000
2000
3000
4000
5000
2IDO 5010
0
1000
2000
3000
4000