# Prepare the DUDE database for analysis

Perform preprocessing on the DUDE database.  Assumes that the DUDE database has been downloaded and placed in the path "data/dude/unprocessed". Here, we expect directories for each target containing 'actives_final.ism' and 'decoys_final.ism'.
It may be beneficial to split the below out of this Jupyter notebook and deploy to HPC or a non interactive environment. To pre-cache descriptors, run cache_mols_from_csv() on the DudePreprocessor object below.

The step may also be skipped by downloading the OpenFEPOPS-DUDE data directory created for analysis of the DUDE diversity set from FigShare at https://doi.org/10.6084/m9.figshare.23951445.v3.  With the data files in this archive unzipped to data/dude/processed/diversity_set/, you can skip the lengthy scoring and use intermediate files to generate the final AUROC table by executing code under the heading "Output summary AUROC results table as shown in the paper"

In [2]:
from fepops.utils import DudePreprocessor
dude_preprocessor=DudePreprocessor()
dude_preprocessor()

data/dude/
Processing the following DUDE targets: []


Preparing targets: 0it [00:00, ?it/s]


# Analyse the DUDE diversity set and obtain mean and standard deviations for each 'standard' set of FEPOPS descriptors

Below, we read in the DUDE diversity set and gather the mean and standard deviation of the produced FEPOPS descriptors and use these values as defaults for scaling (before scoring) all FEPOPS descriptors. Before running this, there should be 8 targets:

* akt1
* ampc
* cp3a4
* cxcr4
* gcr
* hivpr
* hivrt
* kif11

represented by their associated .csv and .db files present in the data/dude/processed/diversity_set/ directory

In [3]:
from pathlib import Path
from tqdm import tqdm
from fepops.fepops_persistent import get_persistent_fepops_storage_object
import numpy as np
dude_diversity_set_path=Path("data/dude/processed/diversity_set/")
diversity_target_files=list(dude_diversity_set_path.glob("dude_target_*.csv"))
descriptors=[]
for diversity_target in diversity_target_files:
    f=get_persistent_fepops_storage_object(diversity_target.with_suffix(".db"))
    print(f"Working on {diversity_target}")
    for (orig_smi, dude_id, chemblid, active_flag, can_smi) in tqdm([l.strip().split(",") for l in open(diversity_target).readlines()[1:] if len(l)>3]):
        status, retrieved_descriptors=f.get_fepops(can_smi)
        if status.value ==1:
            for d in retrieved_descriptors:
                descriptors.append(d)
            
descriptors=np.array(descriptors)
display("Mean:", descriptors.mean(axis=0))
display("Std:", descriptors.std(axis=0))

Working on data/dude/processed/diversity_set/dude_target_kif11.csv


 19%|█▉        | 1335/6966 [00:00<00:02, 2526.54it/s]

100%|██████████| 6966/6966 [00:02<00:00, 2563.43it/s]


Working on data/dude/processed/diversity_set/dude_target_hivrt.csv


100%|██████████| 19229/19229 [00:07<00:00, 2667.88it/s]


Working on data/dude/processed/diversity_set/dude_target_cxcr4.csv


100%|██████████| 3446/3446 [00:01<00:00, 2630.62it/s]


Working on data/dude/processed/diversity_set/dude_target_ampc.csv


100%|██████████| 2898/2898 [00:00<00:00, 2996.00it/s]


Working on data/dude/processed/diversity_set/dude_target_gcr.csv


 66%|██████▋   | 10130/15258 [00:04<00:01, 2578.91it/s][16:49:00] Explicit valence for atom # 12 N, 5, is greater than permitted
100%|██████████| 15258/15258 [00:06<00:00, 2447.16it/s]


Working on data/dude/processed/diversity_set/dude_target_cp3a4.csv


100%|██████████| 11970/11970 [00:04<00:00, 2422.45it/s]


Working on data/dude/processed/diversity_set/dude_target_akt1.csv


100%|██████████| 16743/16743 [00:06<00:00, 2430.18it/s]


Working on data/dude/processed/diversity_set/dude_target_hivpr.csv


100%|██████████| 36286/36286 [00:15<00:00, 2320.89it/s]


'Mean:'

array([-0.28971602,  0.5181022 ,  0.37487135,  0.99922747, -0.04187301,
        1.03382471,  0.27407036,  0.99853436,  0.09725517,  1.12824307,
        0.23735556,  0.99882914,  0.35977538,  0.66653514,  0.41238282,
        0.99902545,  5.71261449,  6.37716992,  6.47293777,  6.26134733,
        6.20354385,  6.23201498])

'Std:'

array([0.35110473, 1.00839329, 0.4838859 , 0.02769204, 0.15418035,
       0.86446056, 0.44583626, 0.0381767 , 0.16095862, 0.92079483,
       0.42526185, 0.03413741, 0.35756229, 1.36093993, 0.4921059 ,
       0.0311619 , 1.9668792 , 2.31266486, 2.50699385, 2.41269982,
       2.30018205, 2.31527129])

# Benchmark FEPOPS using the DUDE diversity set
Collect AUROC scores and compare against Morgan 2 and RDKit fingeprints
<hr>

Perform imports, define similarity methods (except OpenFEPOPS which will be defined upon target assessment in order to load cached descriptors)

In [4]:
import numpy as np
from tqdm import tqdm

from dataclasses import dataclass
from typing import Callable, Optional
from pathlib import Path
import pandas as pd
from rdkit import Chem

from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.metrics import roc_auc_score
from fepops import OpenFEPOPS
from fepops.fepops_persistent import get_persistent_fepops_storage_object
from fepops.fepops import GetFepopStatusCode
import json

open_fepops_object=OpenFEPOPS()

@dataclass
class SimilarityMethod:
    name: str
    supports_multiple_candidates: bool
    descriptor_calc_func: Optional[Callable] = None
    descriptor_score_func: Optional[Callable] = None


# OpenFEPOPS will be added to this as a persistent object, reading from a DB of cached
# molecules for each target in the diversity set
similarity_methods = {
    'Morgan 2': SimilarityMethod(
        "Morgan 2",
        False,
        lambda x: AllChem.GetMorganFingerprint(x, 2),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
    'MACCS': SimilarityMethod(
        "MACCS",
        False,
        lambda x: rdMolDescriptors.GetMACCSKeysFingerprint(x),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
    'RDKit': SimilarityMethod(
        "RDKit",
        False,
        lambda x: Chem.RDKFingerprint(x,maxPath=4),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
}

diversity_set_csv_files = list(Path("data/dude/processed/diversity_set/").glob("dude_target_*.csv"))
# Write all CSVs to SMILES files
print(f"Got {len(diversity_set_csv_files)} diversity_set_csv_files : {[f.stem for f in diversity_set_csv_files]}")
for csv_file_path in diversity_set_csv_files:
    pd.read_csv(csv_file_path,sep=",",
        index_col=[0],
        header=0,
        ).loc[:,['rdkit_canonical_smiles', 'DUDEID']].reset_index().to_csv(csv_file_path.with_suffix(".smi"), sep=" ", index=None, header=False)
    

Got 8 diversity_set_csv_files : ['dude_target_kif11', 'dude_target_hivrt', 'dude_target_cxcr4', 'dude_target_ampc', 'dude_target_gcr', 'dude_target_cp3a4', 'dude_target_akt1', 'dude_target_hivpr']


Perform AUROC score calculation

In [5]:
diversity_set_csv_dir = diversity_set_csv_files[0].parent
results_files_derived_from_diversity_set_csv_files = [
    f.parent / Path(f"res_df_{f.stem}.csv") for f in diversity_set_csv_files
]

existing_results_files_derived_from_diversity_set_csv_files = [
    f for f in results_files_derived_from_diversity_set_csv_files if f.exists()
]

if len(existing_results_files_derived_from_diversity_set_csv_files)>0:
    raise FileExistsError(
        f"Found processed results files in the {existing_results_files_derived_from_diversity_set_csv_files[0].parent} directory, remove res_df_dude_target_*.csv\nFound files were {existing_results_files_derived_from_diversity_set_csv_files} \nPlease remove"
    )

for csv_file_path in diversity_set_csv_files:
    auroc_scores_info_df = pd.DataFrame()

    # We replace the OpenFepops similarity object at each new diversity CSV file
    # so that new databases may be loaded for speed of descriptor retrieval.
    ofepops_persistent = get_persistent_fepops_storage_object(
        csv_file_path.with_suffix(".db")
    )
    similarity_methods['OpenFEPOPS'] = SimilarityMethod(
        "OpenFEPOPS",
        True,
        lambda x: ofepops_persistent.get_fepops(x, is_canonical=True),
        ofepops_persistent.calc_similarity,
    )

    df = pd.read_csv(
        csv_file_path,
        sep=",",
        index_col=[0],
        header=0,
    ).reset_index()
    print(df.head())
    descriptors = {k: [] for k in similarity_methods.keys()}
    smiles_list = df['rdkit_canonical_smiles'].tolist()
    labels_list = df['Active'].astype(int).tolist()
    problematic_compound_indexes = []
    for sm_name, sm in similarity_methods.items():
        # Cache all descriptors for each molecular similarity technique, as some mols may be bad and have to be removed
        for smiles_i, smiles in tqdm(
            enumerate(smiles_list),
            desc=f"Caching {sm_name} descriptors for {csv_file_path.stem}",
        ):
            mol_from_smiles = open_fepops_object._mol_from_smiles(smiles)
            if mol_from_smiles is None:
                problematic_compound_indexes.append(smiles_i)
                descriptors[sm_name].append(np.nan)
            else:
                res = sm.descriptor_calc_func(mol_from_smiles)
                if isinstance(res, tuple):
                    if (
                        res[0] == GetFepopStatusCode.FAILED_RETRIEVED_NONE
                        or res[0] == GetFepopStatusCode.FAILED_TO_GENERATE
                        or res[0] == GetFepopStatusCode.FAILED_TO_RETRIEVE
                        or res[1] is None
                    ):
                        print(f"Problem with {smiles}, {res}")
                        problematic_compound_indexes.append(smiles_i)
                        descriptors[sm_name].append(np.nan)
                    else:
                        descriptors[sm_name].append(res[-1])
                else:
                    if res is None:
                        problematic_compound_indexes.append(smiles_i)
                    descriptors[sm_name].append(res)
    # Remove failed molecules from pool of descriptors and labels
    for k, v in descriptors.items():
        descriptors[k] = [
            v[ii] for ii in range(len(v)) if ii not in problematic_compound_indexes
        ]
    labels_list = [
        labels_list[ii]
        for ii in range(len(labels_list))
        if ii not in problematic_compound_indexes
    ]
    auroc_scores = {smn: [] for smn in similarity_methods.keys()}

    info_dict = {}
    for sm_name, sm in similarity_methods.items():
        # Remove entries which did not return a mol
        info_dict['target'] = csv_file_path.stem.replace("dude_target_", "")
        info_dict['similarity_method'] = sm_name
        info_dict['smiles_count'] = len(smiles_list)
        info_dict['actives_count'] = np.sum(labels_list)
        info_dict['failed_smiles'] = len(problematic_compound_indexes)
        info_dict['failed_active_smiles'] = len(
            [ft for ft in problematic_compound_indexes if labels_list[ft] == 1]
        )
        for active_i in tqdm(
            np.argwhere(np.array(labels_list) == 1).flatten(),
            desc=f"Assessing active recall (AUROC) for {sm.name}",
        ):
            if sm.supports_multiple_candidates:
                scores = np.array(
                    sm.descriptor_score_func(
                        descriptors[sm_name][active_i], descriptors[sm_name]
                    ),
                    dtype=float,
                ).flatten()
            else:
                scores = np.array(
                    [
                        sm.descriptor_score_func(
                            descriptors[sm_name][active_i],
                            descriptors[sm_name][smiles_i],
                        )
                        for smiles_i in range(len(descriptors[sm_name]))
                    ],
                    dtype=float,
                ).flatten()

            auroc_scores[sm_name].append(
                roc_auc_score(
                    np.array(labels_list)[np.argwhere(~np.isnan(scores))],
                    scores[np.argwhere(~np.isnan(scores))],
                )
            )
        info_dict['average_auroc_score'] = np.mean(auroc_scores[sm_name])
        info_dict['median_auroc_score'] = np.median(auroc_scores[sm_name])
        info_dict['q1_auroc_score'] = np.percentile(auroc_scores[sm_name], 0.25)
        info_dict['q3_auroc_score'] = np.percentile(auroc_scores[sm_name], 0.75)
        auroc_scores_info_df = pd.concat(
            [auroc_scores_info_df, pd.DataFrame.from_dict({k:[v] for k,v in info_dict.items()})], ignore_index=True, axis=0
        )
    print("Writing to ", csv_file_path.parent)

    json.dump(
        auroc_scores,
        open(csv_file_path.parent / Path(f"res_scores_{csv_file_path.stem}.json"), "w"),
    )
    auroc_scores_info_df.to_csv(
        csv_file_path.parent / Path(f"res_df_{csv_file_path.stem}.csv")
    )
print(auroc_scores_info_df)

                                              SMILES  DUDEID      CHEMBLID  \
0  CN(C)CCC[C@@]3(c1ccccc1)CN(c2cc(C)ccc2F)N=C3C(...  415211  CHEMBL247044   
1  CC(=O)C3=NN1[C@@H](COc2ccc(F)cc12)[C@@]3(CCCN4...  414282  CHEMBL398478   
2  N[C@@H](C1CC1)C(=O)N3CC(c2cc(F)ccc2F)=C[C@H]3c...  344118  CHEMBL206115   
3  CC(=O)C3=NN1[C@@H](COc2ccc(F)cc12)[C@@]3(CCCN)...  414288  CHEMBL250127   
4  CC(=O)N2N=C(c1cc(F)ccc1F)C[C@@]2(C[C@H](F)CN)c...  402131  CHEMBL400042   

   Active                             rdkit_canonical_smiles  
0       1  CC(=O)C1=NN(c2cc(C)ccc2F)C[C@@]1(CCCN(C)C)c1cc...  
1       1  CC(=O)C1=NN2c3cc(F)ccc3OC[C@H]2[C@@]1(CCCN1CCO...  
2       1  N[C@H](C(=O)N1CC(c2cc(F)ccc2F)=C[C@H]1c1cccc(O...  
3       1  CC(=O)C1=NN2c3cc(F)ccc3OC[C@H]2[C@@]1(CCCN)c1c...  
4       1  CC(=O)N1N=C(c2cc(F)ccc2F)C[C@@]1(C[C@H](F)CN)c...  


Caching Morgan 2 descriptors for dude_target_kif11: 506it [00:00, 5058.56it/s]

Caching Morgan 2 descriptors for dude_target_kif11: 6966it [00:01, 4818.02it/s]
Caching MACCS descriptors for dude_target_kif11: 6966it [00:05, 1237.28it/s]
Caching RDKit descriptors for dude_target_kif11: 6966it [00:02, 3479.81it/s]
Caching OpenFEPOPS descriptors for dude_target_kif11: 4359it [00:01, 2493.69it/s]

Problem with Cc1ccccc1NC(=O)[C@@H]1[C@H]2/C=C/CCCC[C@H]21, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_kif11: 6966it [00:02, 2458.32it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 116/116 [00:02<00:00, 53.28it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 116/116 [00:01<00:00, 93.27it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 116/116 [00:01<00:00, 91.25it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 116/116 [00:37<00:00,  3.13it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES  DUDEID      CHEMBLID  \
0  Nc3nc(N)c2ncn([C@H]1C[C@@H](O)[C@](CO)(C#C)O1)...  231600  CHEMBL138639   
1       Cc3nc(CCCCCCC(=O)c1ccccc1)n4nc(n2ccnc2)ccc34  205602  CHEMBL338099   
2      CCOCn2c(Cc1cc(C)cc(C)c1)c(C(C)C)c(=O)[nH]c2=O  158353  CHEMBL319139   
3       FC(F)(F)[C@@]3(C#CC1CC1)OC(=O)Nc2ccc(Cl)cc23  374719  CHEMBL223228   
4                Fc1cccc(F)c1CC/N=C(\S)Nc2ccc(Br)cn2  146194   CHEMBL88677   

   Active                            rdkit_canonical_smiles  
0       1  C#C[C@]1(CO)O[C@@H](n2cnc3c(N)nc(N)nc32)C[C@H]1O  
1       1     Cc1nc(CCCCCCC(=O)c2ccccc2)n2nc(-n3ccnc3)ccc12  
2       1     CCOCn1c(Cc2cc(C)cc(C)c2)c(C(C)C)c(=O)[nH]c1=O  
3       1      O=C1Nc2ccc(Cl)cc2[C@@](C#CC2CC2)(C(F)(F)F)O1  
4       1               Fc1cccc(F)c1CC/N=C(\S)Nc1ccc(Br)cn1  


Caching Morgan 2 descriptors for dude_target_hivrt: 19229it [00:03, 5235.12it/s]
Caching MACCS descriptors for dude_target_hivrt: 19229it [00:14, 1288.31it/s]
Caching RDKit descriptors for dude_target_hivrt: 19229it [00:05, 3651.40it/s]
Caching OpenFEPOPS descriptors for dude_target_hivrt: 1067it [00:00, 2648.00it/s]

Problem with O=C([O-])C12CC3[C@H](C[C@@H]3C1)C2, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 2106it [00:00, 2437.86it/s]

Problem with C/C(=N\NC(=O)C1[C@H]2CC/C=C/CC[C@H]12)c1ccc(Br)cc1, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 2836it [00:01, 2348.85it/s]

Problem with C[C@@H]1OP2(C[NH+]3CCCCC3)(O[C@@H](C)[C@@H](C)O2)O[C@@H]1C, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)
Problem with O=C(Nc1cccc([N+](=O)[O-])c1)[C@H]1[C@@H]2C[C@@H]3[C@@H]1C(=O)O[C@@H]3C2, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 4380it [00:01, 2546.53it/s]

Problem with CC12C(=O)C(=O)C3(C)[C@@H]4C(C5[C@@H]1[C@H]1CCC[C@@H]1[C@H]53)[C@@H]2[C@@H]1CCC[C@@H]14, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 7566it [00:02, 2686.33it/s]

Problem with C/C=C1/CN2CC[C@@]34C(=C(C=O)[C@H]1C[C@H]23)Nc1c(O)cccc14, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 8683it [00:03, 2617.36it/s]

Problem with C/C1=C\C[C@H]2C3(CC[C@@]2(C)[C@@H](O)[C@H]2CC=C[C@@H]12)SCCS3, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 18750it [00:07, 2481.14it/s]

Problem with CN1c2ncnc(Cl)c2/N=C(\F)Cc2cc(F)cc(F)c21, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivrt: 19229it [00:07, 2519.03it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 338/338 [00:16<00:00, 20.88it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 338/338 [00:10<00:00, 32.26it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 338/338 [00:10<00:00, 32.72it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 338/338 [04:56<00:00,  1.14it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES  DUDEID       CHEMBLID  \
0                   c1ccnc(c1)NCc2ccc(cc2)CNc3ccccn3  403120   CHEMBL237830   
1       c1cc(ccc1CN2CCCNCCNCCCNCC2)CN3CCCNCCNCCCNCC3   20346    CHEMBL18442   
2  c1cc2nc(c1)CCNCCN(CCNCC2)Cc3ccc(cc3)CN4CCNCCc5...  676182  CHEMBL1202231   
3    CC1(CN2C(=CSC2=N1)CS/C(=N\C3CCCCC3)/NC4CCCCC4)C  454523   CHEMBL460491   
4  CC1(CN2C(=CSC2=N1)CS/C(=N\C3CCCCCC3)/NC4CCCCCC4)C  454524   CHEMBL518501   

   Active                             rdkit_canonical_smiles  
0       1                   c1ccc(NCc2ccc(CNc3ccccn3)cc2)nc1  
1       1       c1cc(CN2CCCNCCNCCCNCC2)ccc1CN1CCCNCCNCCCNCC1  
2       1  c1cc2nc(c1)CCNCCN(Cc1ccc(CN3CCNCCc4cccc(n4)CCN...  
3       1     CC1(C)CN2C(CS/C(=N\C3CCCCC3)NC3CCCCC3)=CSC2=N1  
4       1   CC1(C)CN2C(CS/C(=N\C3CCCCCC3)NC3CCCCCC3)=CSC2=N1  


Caching Morgan 2 descriptors for dude_target_cxcr4: 3446it [00:00, 5279.74it/s]
Caching MACCS descriptors for dude_target_cxcr4: 3446it [00:02, 1354.53it/s]
Caching RDKit descriptors for dude_target_cxcr4: 3446it [00:00, 3841.84it/s]
Caching OpenFEPOPS descriptors for dude_target_cxcr4: 3446it [00:01, 2558.54it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 40/40 [00:00<00:00, 86.53it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 40/40 [00:00<00:00, 167.63it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 40/40 [00:00<00:00, 164.81it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 40/40 [00:06<00:00,  6.23it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES DUDEID  CHEMBLID  Active  \
0     S(Nc1c(O)cc(C(=O)O)cc1)(c2c(scc2)C(=O)O)(=O)=O    116       NaN       1   
1       S(Nc1ccc(C(=O)O)cc1)(c2ccsc2C(=O)[O-])(=O)=O    113       NaN       1   
2       S(Nc1cccc(c1)C(=O)O)(c2ccsc2C(=O)[O-])(=O)=O    114       NaN       1   
3   S(Nc1ccc(C(=O)O)c(c1)Cl)(c2ccsc2C(=O)[O-])(=O)=O    115       NaN       1   
4  OC(=O)[C@@H](CC1=CC=CC2=CC=CC=C12)N1C(=O)C2=C(...    211       NaN       1   

                              rdkit_canonical_smiles  
0         O=C(O)c1ccc(NS(=O)(=O)c2ccsc2C(=O)O)c(O)c1  
1         O=C(O)c1ccc(NS(=O)(=O)c2ccsc2C(=O)[O-])cc1  
2         O=C(O)c1cccc(NS(=O)(=O)c2ccsc2C(=O)[O-])c1  
3       O=C(O)c1ccc(NS(=O)(=O)c2ccsc2C(=O)[O-])cc1Cl  
4  O=C(O)c1ccc2c(c1)C(=O)N([C@H](Cc1cccc3ccccc13)...  


Caching Morgan 2 descriptors for dude_target_ampc: 2898it [00:00, 6785.93it/s]
Caching MACCS descriptors for dude_target_ampc: 2898it [00:01, 1745.53it/s]
Caching RDKit descriptors for dude_target_ampc: 2898it [00:00, 4796.14it/s]
Caching OpenFEPOPS descriptors for dude_target_ampc: 576it [00:00, 2915.24it/s]

Problem with Cc1ccc([C@H]2[C@@]([O-])([C@H](O)CO)[C@]3(O)[C@@H](c4ccc(C)cc4)[C@H]([O-])[C@@]23[O-])cc1, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_ampc: 2898it [00:00, 2935.57it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 48/48 [00:00<00:00, 134.15it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 48/48 [00:00<00:00, 199.49it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 48/48 [00:00<00:00, 195.26it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 48/48 [00:06<00:00,  7.07it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES  DUDEID       CHEMBLID  \
0  CC#C[C@]5(O)CC[C@H]4[C@@H]2CCC1=CC(=O)CCC1=C2[...  674976  CHEMBL1201025   
1  C[C@@H]4C[C@H]3[C@@H]2C[C@H](F)C1=CC(=O)C=C[C@...  508532     CHEMBL1676   
2  CCN(CC(O)(CNc2cccc3n(c1ccc(F)cc1)ncc23)C(F)(F)...  560820   CHEMBL551816   
3  Oc4ccc2c(CC[C@@H]1C[C@@](O)(C#CCl)CC[C@]12Cc3c...  127333    CHEMBL77779   
4  CC#C[C@]5(O)CCC4C3CCC1=CC(=O)CC[C@]1(Cc2ccc(C)...  127976    CHEMBL78704   

   Active                             rdkit_canonical_smiles  
0       1  CC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3[...  
1       1  C[C@@H]1C[C@H]2[C@@H]3C[C@H](F)C4=CC(=O)C=C[C@...  
2       1  CCN(CC(O)(CNc1cccc2c1cnn2-c1ccc(F)cc1)C(F)(F)F...  
3       1  Oc1ccc2c(c1)CC[C@@H]1C[C@@](O)(C#CCl)CC[C@@]21...  
4       1  CC#C[C@]1(O)CCC2C3CCC4=CC(=O)CC[C@]4(Cc4ccc(C)...  


Caching Morgan 2 descriptors for dude_target_gcr: 9770it [00:02, 4606.01it/s][16:56:56] Explicit valence for atom # 12 N, 5, is greater than permitted
Caching Morgan 2 descriptors for dude_target_gcr: 15258it [00:03, 4541.84it/s]
Caching MACCS descriptors for dude_target_gcr: 10140it [00:08, 1305.80it/s][16:57:06] Explicit valence for atom # 12 N, 5, is greater than permitted
Caching MACCS descriptors for dude_target_gcr: 15258it [00:13, 1137.42it/s]
Caching RDKit descriptors for dude_target_gcr: 9888it [00:03, 3281.65it/s][16:57:14] Explicit valence for atom # 12 N, 5, is greater than permitted
Caching RDKit descriptors for dude_target_gcr: 15258it [00:04, 3130.28it/s]
Caching OpenFEPOPS descriptors for dude_target_gcr: 3905it [00:01, 2221.66it/s]

Problem with CSc1cc(Cl)c2c(c1)C/C(Br)=N/c1c(Cl)ncnc1N2C, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_gcr: 6088it [00:02, 2255.64it/s]

Problem with CSc1cc(Br)c2c(c1)C/C(Br)=N/c1c(Cl)ncnc1N2C, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)
Problem with CSc1cc2c(c(SC)c1)N(C)c1ncnc(Cl)c1/N=C(\I)C2, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)
Problem with CN1c2ncnc(Cl)c2/N=C(\Cl)Cc2cc(-c3ccccc3O)cc(Br)c21, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_gcr: 7533it [00:03, 2326.57it/s]

Problem with CSc1cc(-c2ccccc2O)cc2c1N(C)c1ncnc(Cl)c1/N=C(\Br)C2, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_gcr: 8767it [00:03, 2264.20it/s]

Problem with CN1c2ncnc(Cl)c2/N=C(\I)Cc2cc(-c3ccccc3O)cc(O)c21, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_gcr: 9958it [00:04, 2369.43it/s][16:57:20] Explicit valence for atom # 12 N, 5, is greater than permitted
Caching OpenFEPOPS descriptors for dude_target_gcr: 15258it [00:06, 2279.40it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 258/258 [00:10<00:00, 23.81it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 258/258 [00:06<00:00, 41.94it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 258/258 [00:06<00:00, 41.87it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 258/258 [02:57<00:00,  1.46it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES  DUDEID       CHEMBLID  \
0               Cc1n[nH]c(C)c1CC(=O)NCc2ccc(Cl)cc2Cl  684372  CHEMBL1210558   
1        CN2[C@H](C(=O)NCc1cccc(C(F)(F)F)c1Cl)CCC2=O  689324  CHEMBL1222883   
2  Cc4cc(N1CCC(O)CC1)cc5[nH]c(c3c(NC[C@@H](O)c2cc...  468023   CHEMBL520419   
3  Cc4cc(N1CCOCC1)cc5[nH]c(c3c(NC[C@@H](O)c2cccc(...  424433   CHEMBL401930   
4  Cc5cc(N2CCC(NC(=O)C1CC1)CC2)cc6[nH]c(c4c(NC[C@...  468227   CHEMBL480966   

   Active                             rdkit_canonical_smiles  
0       1               Cc1n[nH]c(C)c1CC(=O)NCc1ccc(Cl)cc1Cl  
1       1        CN1C(=O)CC[C@H]1C(=O)NCc1cccc(C(F)(F)F)c1Cl  
2       1  Cc1cc(N2CCC(O)CC2)cc2[nH]c(-c3c(NC[C@@H](O)c4c...  
3       1  Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NC[C@@H](O)c4cccc...  
4       1  Cc1cc(N2CCC(NC(=O)C3CC3)CC2)cc2[nH]c(-c3c(NC[C...  


Caching Morgan 2 descriptors for dude_target_cp3a4: 11970it [00:02, 4304.88it/s]
Caching MACCS descriptors for dude_target_cp3a4: 11970it [00:10, 1113.18it/s]
Caching RDKit descriptors for dude_target_cp3a4: 11970it [00:03, 3033.39it/s]
Caching OpenFEPOPS descriptors for dude_target_cp3a4: 693it [00:00, 2324.49it/s]

Problem with CN1c2ncnc(Cl)c2/N=C(\Br)Cc2cc(Br)cc(O)c21, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)
Problem with CSc1cc(I)cc2c1N(C)c1ncnc(Cl)c1/N=C(\O)C2, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_cp3a4: 5116it [00:02, 2234.27it/s]

Problem with CC(C)S(=O)(=O)c1cc(F)c2c(c1)C/C(Cc1ccccc1)=N/c1c(Cl)ncnc1N2C, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_cp3a4: 9426it [00:04, 2324.46it/s]

Problem with Cc1ccc(S(=O)(=O)O[C@@H]2[C@H](C)[C@H]3CS(=O)(=O)[C@@H]32)cc1, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_cp3a4: 11970it [00:05, 2246.89it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 170/170 [00:06<00:00, 26.87it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 170/170 [00:03<00:00, 52.69it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 170/170 [00:03<00:00, 52.63it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 170/170 [01:31<00:00,  1.85it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES  DUDEID       CHEMBLID  \
0  NC[C@H](Cc1cccc(F)c1)NC(=O)c4cc(Br)c(c2ccnc3[n...  522182   CHEMBL523586   
1  Cc4n[nH]c5ccc(c3cncc(OC[C@@H](N)Cc1c[nH]c2cccc...  350698   CHEMBL379300   
2  N[C@H](COc4cncc(c3ccc2NC(=O)C(c1ccco1)c2c3)c4)...  569685   CHEMBL573326   
3  CC(C)(Cc1ccco1)C5C(=O)Nc6ccc(c4cncc(OC[C@@H](N...  350814   CHEMBL210954   
4  CCn3c(c1nonc1N)nc4c(C#CC(C)(C)O)nc(O[C@@H](CCN...  624069  CHEMBL1099297   

   Active                             rdkit_canonical_smiles  
0       1  NC[C@H](Cc1cccc(F)c1)NC(=O)c1cc(Br)c(-c2ccnc3[...  
1       1  Cc1n[nH]c2ccc(-c3cncc(OC[C@@H](N)Cc4c[nH]c5ccc...  
2       1  N[C@H](COc1cncc(-c2ccc3c(c2)C(c2ccco2)C(=O)N3)...  
3       1  CC(C)(Cc1ccco1)C1C(=O)Nc2ccc(-c3cncc(OC[C@@H](...  
4       1  CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)nc(O[C@@H](CC...  


Caching Morgan 2 descriptors for dude_target_akt1: 16743it [00:03, 4472.92it/s]
Caching MACCS descriptors for dude_target_akt1: 16743it [00:14, 1149.11it/s]
Caching RDKit descriptors for dude_target_akt1: 16743it [00:05, 3167.95it/s]
Caching OpenFEPOPS descriptors for dude_target_akt1: 16743it [00:07, 2281.12it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 293/293 [00:14<00:00, 20.07it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 293/293 [00:07<00:00, 37.56it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 293/293 [00:07<00:00, 37.98it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 293/293 [03:43<00:00,  1.31it/s]


Writing to  data/dude/processed/diversity_set
                                              SMILES  DUDEID      CHEMBLID  \
0  CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OC2CO[...  489942  CHEMBL477992   
1  CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)O[C@@H]3C[C@@H]...  356299  CHEMBL377329   
2  COc5cc(CN4[C@H](Cc1ccccc1)[C@H](O)CN(Cc2ccccc2...  212772  CHEMBL443030   
3  COc5ccc(S(=O)(=O)N(C[C@@H](O)[C@H](Cc1ccccc1)N...  316166  CHEMBL264818   
4  Nc5cccc(S(=O)(=O)N(C[C@@H](O)[C@H](Cc1ccccc1)N...  316187  CHEMBL362510   

   Active                             rdkit_canonical_smiles  
0       1  CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OC1CO[...  
1       1  CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)O[C@@H]1C[C@@H]...  
2       1  COc1cc(CN2C(=O)N(Cc3ccc(O)c(OC)c3)N(Cc3ccccc3)...  
3       1  COc1ccc(S(=O)(=O)N(C[C@@H](O)[C@H](Cc2ccccc2)N...  
4       1  Nc1cccc(S(=O)(=O)N(C[C@@H](O)[C@H](Cc2ccccc2)N...  


Caching Morgan 2 descriptors for dude_target_hivpr: 36286it [00:09, 3937.35it/s]
Caching MACCS descriptors for dude_target_hivpr: 36286it [00:31, 1139.19it/s]
Caching RDKit descriptors for dude_target_hivpr: 36286it [00:13, 2788.05it/s]
Caching OpenFEPOPS descriptors for dude_target_hivpr: 583it [00:00, 1955.02it/s]

Problem with C=CO, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivpr: 26071it [00:11, 2233.16it/s]

Problem with CC(C)=C/C=C/[C@H](C)[C@H]1CC[C@@]2(C)C[C@@H]3C(C)=CC(=O)[C@@H]3/C(C(=O)[O-])=C/C[C@@H]12, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_hivpr: 36286it [00:16, 2152.21it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 535/535 [00:57<00:00,  9.32it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 535/535 [00:30<00:00, 17.49it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 535/535 [00:30<00:00, 17.53it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 535/535 [14:09<00:00,  1.59s/it]

Writing to  data/dude/processed/diversity_set
  target similarity_method  smiles_count  actives_count  failed_smiles  \
0  hivpr          Morgan 2         36286            535              2   
1  hivpr             MACCS         36286            535              2   
2  hivpr             RDKit         36286            535              2   
3  hivpr        OpenFEPOPS         36286            535              2   

   failed_active_smiles  average_auroc_score  median_auroc_score  \
0                     1             0.779684            0.808723   
1                     1             0.681231            0.700411   
2                     1             0.759303            0.781645   
3                     1             0.678182            0.704364   

   q1_auroc_score  q3_auroc_score  
0        0.238002        0.402323  
1        0.283952        0.315156  
2        0.309755        0.369676  
3        0.334860        0.351404  





# Output summary AUROC results table as shown in the paper

In [8]:
import pandas as pd
from pathlib import Path
display(pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df_dude_target_*.csv")]).pivot(values='average_auroc_score',index='target', columns=['similarity_method'],)[['Morgan 2', 'RDKit', 'MACCS', 'OpenFEPOPS']])

similarity_method,Morgan 2,RDKit,MACCS,OpenFEPOPS
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
akt1,0.835717,0.833538,0.741306,0.831254
ampc,0.783629,0.659837,0.67331,0.639413
cp3a4,0.602774,0.613335,0.581545,0.647462
cxcr4,0.697226,0.592485,0.854251,0.899027
gcr,0.670082,0.708346,0.66583,0.616228
hivpr,0.779684,0.759303,0.681231,0.678182
hivrt,0.651023,0.66007,0.669954,0.58224
kif11,0.763057,0.672469,0.667852,0.713142


Output an extended table with information on failed molecules etc

In [9]:
import pandas as pd
from pathlib import Path
display(pd.concat([pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df_dude_target_*.csv")]).pivot(values='average_auroc_score',index='target', columns=['similarity_method'],),pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df*.csv")]).query("similarity_method=='OpenFEPOPS'")[['target','smiles_count','actives_count','failed_smiles','failed_active_smiles']].set_index("target")], axis=1))

Unnamed: 0_level_0,MACCS,Morgan 2,OpenFEPOPS,RDKit,smiles_count,actives_count,failed_smiles,failed_active_smiles
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
akt1,0.741306,0.835717,0.831254,0.833538,16743,293,0,0
ampc,0.67331,0.783629,0.639413,0.659837,2898,48,1,0
cp3a4,0.581545,0.602774,0.647462,0.613335,11970,170,4,0
cxcr4,0.854251,0.697226,0.899027,0.592485,3446,40,0,0
gcr,0.66583,0.670082,0.616228,0.708346,15258,258,10,0
hivpr,0.681231,0.779684,0.678182,0.759303,36286,535,2,1
hivrt,0.669954,0.651023,0.58224,0.66007,19229,338,8,0
kif11,0.667852,0.763057,0.713142,0.672469,6966,116,1,0
