In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [9]:
# Fill in all of the score files you wish to draw your order from

score_files = [
    "paper_interface_design_production_combined.sc",
    "motif_graft_design/paper_interface_design_production_combined.sc"
]


dfs = []
for file in score_files:   
    print(file)
    asdf = pd.read_csv(file, sep="\s+")
    
    dfs.append(asdf)
    
df = pd.concat(dfs)

test/rifdock/paper_interface_design_production_combined.sc
test/rifdock/motif_design/paper_interface_design_production_combined.sc


In [50]:
def top_x_by_multiple(data, desired_num ):
    # creating lists where each design has it's rank within each term
    arg_sorted = np.argsort(data, axis=-1)
    ranked = np.zeros(arg_sorted.shape, np.int)
    for icol in range(len(ranked)):
        ranked[icol,arg_sorted[icol]] = np.arange(0, total, dtype=np.int)

    # first power of 2 where bigger_2 // 2 > total
    bigger_2 = 2**(np.floor(np.log(total)/np.log(2)).astype(int) + 2)
    assert( bigger_2 // 2 > total )

    percentiles = np.linspace(0, 1, bigger_2)

    space_size = bigger_2
    next_cut = bigger_2 // 2 - 1

    remaining = 0
    cutoff = total

    # binary search
    # We're trying to find a percentile where if we take everything better than
    #  that percentile from all categories, and look for members that exist in all
    #  lists, it's equal to the number of designs we want to keep
    while True:

        # this is the actual ranking process
        # take top X in each argsort and make sure they're in all top Xs
        eval_percentile = (percentiles[next_cut] + percentiles[next_cut+1]) / 2
        cutoff = eval_percentile * total

        mask = np.ones(total, np.bool)

        for icol in range(len(data)):
            mask &= ranked[icol] >= cutoff

        # end ranking


        remaining = mask.sum()

        # eprint(cutoff, next_cut, "/", bigger_2, remaining)

        if ( remaining == desired_num ):
            break

        space_size //= 2

        if ( space_size == 1 ):
            break

        if ( remaining < desired_num ):
            next_cut -= space_size // 2
        else:
            next_cut += space_size // 2
            
    print("Best effort == %ith percentile"%(eval_percentile*100))
            
    return mask


In [61]:

number_to_order = 5

# All of your designs must pass these cutoffs

hard_cuts = {               # cutoff, higher is better
    'ddg' :                     [ -30,  False],
    'contact_molecular_surface':[ 450,  True],
    'score_per_res' :           [-2.4, False],
    'mismatch_probability':     [ 0.1, False],
    'sap_score':                [  35, False],
    'binder_delta_sap':         [  12,  True],
#    'ss_sc':                    [0.77, True], # optional
}

# After the hard cuts. Find the best by the following metrics

best_effort = {             # higher is better
    'ddg':                         [  False],
    'contact_patch' :               [  True],
    'target_delta_sap':             [  True],
    'contact_molec_sq5_apap_target':[  True],
}


########### Don't change stuff below here #############


score_df = df

# Print the pass rates for each term
print("")
print("=========================== Hard cuts: ===========================")
score_df['orderable'] = True
for pilot_term in hard_cuts:
    cut, higher_better = hard_cuts[pilot_term]
    ok_term = pilot_term.replace("_pilot", "") + "_ok"
    if ( higher_better ):
        score_df[ok_term] = score_df[pilot_term] >= cut
    else:
        score_df[ok_term] = score_df[pilot_term] <= cut
    score_df['orderable'] &= score_df[ok_term]
    print("%30s: %7.2f -- %5.0f%% pass-rate"%(pilot_term, cut, score_df[ok_term].sum() / len(score_df) * 100))


total = score_df['orderable'].sum()
print("")
print("                         %s (%.1f%%) designs remain"%(total, total/len(score_df)*100))

if ( total < number_to_order ):
    print("You don't have enough designs! Either change your cutoffs or go make some more!")
    assert(False)

after_hard_cuts = score_df[score_df['orderable']]
data = np.zeros((len(best_effort), total))
for iterm, term in enumerate(best_effort):
    higher_better = best_effort[term][0]
    if ( higher_better ):
        data[iterm] = after_hard_cuts[term].values
    else:
        data[iterm] = -after_hard_cuts[term].values

print("")
is_in_the_top = top_x_by_multiple(data, number_to_order )
final_df = after_hard_cuts[is_in_the_top]
        
print("")
print("=========================== Best effort: ===========================")
for term in best_effort:
    higher_better = best_effort[term]
    if ( higher_better ):
        cut = final_df[term].min()
        oks = (after_hard_cuts[term] >= cut).sum()
    else:
        cut = final_df[term].max()
        oks = (after_hard_cuts[term] <= cut).sum()
    
    print("%30s: %7.2f -- %5.0f%% pass-rate"%(term, cut, oks / len(after_hard_cuts) * 100))


print("")
print("Final: %i designs"%(len(final_df)))


                           ddg:  -15.00 --    90% pass-rate
     contact_molecular_surface:  200.00 --    62% pass-rate
                 score_per_res:   -2.40 --    98% pass-rate
          mismatch_probability:    0.10 --    90% pass-rate
                     sap_score:   35.00 --   100% pass-rate
              binder_delta_sap:    4.00 --    68% pass-rate

                         81 (40.5%) designs remain

Best effort == 66th percentile

                           ddg:  -33.42 --   100% pass-rate
                 contact_patch:  217.83 --    21% pass-rate
              target_delta_sap:   18.03 --    11% pass-rate
 contact_molec_sq5_apap_target: 3651.43 --    30% pass-rate

Final: 4 designs


In [62]:
output_name = "to_order.list"

final_df[['description']].to_csv(output_name, index=None, header=None)