# Prepare the DUDE database for analysis

Perform preprocessing on the DUDE database.  Assumes that the DUDE database has been downloaded and placed in the path "data/dude/unprocessed". Here, we expect directories for each target containing 'actives_final.ism' and 'decoys_final.ism'.
It may be beneficial to split the below out of this Jupyter notebook and deploy to HPC or a non interactive environment. To pre-cache descriptors, run cache_mols_from_csv() on the DudePreprocessor object below.

The step may also be skipped by downloading the OpenFEPOPS-DUDE data directory created for analysis of the DUDE diversity set from FigShare at https://doi.org/10.6084/m9.figshare.23951445.v2.

In [1]:
from fepops.utils import DudePreprocessor
dude_preprocessor=DudePreprocessor()
dude_preprocessor()

data/dude/
Processing the following DUDE targets: []


Preparing targets: 0it [00:00, ?it/s]


# Analyse the DUDE diversity set and obtain mean and standard deviations for each 'standard' set of FEPOPS descriptors

Below, we read in the DUDE diversity set and gather the mean and standard deviation of the produced FEPOPS descriptors and use these values as defaults for scaling (before scoring) all FEPOPS descriptors. Before running this, there should be 8 targets:

* akt1
* ampc
* cp3a4
* cxcr4
* gcr
* hivpr
* hivrt
* kif11

represented by their associated .csv and .db files present in the data/dude/processed/diversity_set/ directory

In [1]:
from pathlib import Path
from tqdm import tqdm
from fepops.fepops_persistent import get_persistent_fepops_storage_object
import numpy as np
dude_diversity_set_path=Path("data/dude/processed/diversity_set/")
diversity_target_files=list(dude_diversity_set_path.glob("dude_target_*.csv"))
descriptors=[]
for diversity_target in diversity_target_files:
    f=get_persistent_fepops_storage_object(diversity_target.with_suffix(".db"))
    print(f"Working on {diversity_target}")
    for (orig_smi, dude_id, chemblid, active_flag, can_smi) in tqdm([l.strip().split(",") for l in open(diversity_target).readlines()[1:] if len(l)>3]):
        status, retrieved_descriptors=f.get_fepops(can_smi)
        if status.value ==1:
            for d in retrieved_descriptors:
                descriptors.append(d)
            
descriptors=np.array(descriptors)
display("Mean:", descriptors.mean(axis=0))
display("Std:", descriptors.std(axis=0))

Working on data/dude/processed/diversity_set/dude_target_kif11.csv


100%|██████████| 6966/6966 [00:02<00:00, 2522.41it/s]


Working on data/dude/processed/diversity_set/dude_target_hivrt.csv


100%|██████████| 19229/19229 [00:07<00:00, 2640.52it/s]


Working on data/dude/processed/diversity_set/dude_target_cxcr4.csv


100%|██████████| 3446/3446 [00:01<00:00, 2641.72it/s]


Working on data/dude/processed/diversity_set/dude_target_ampc.csv


100%|██████████| 2898/2898 [00:00<00:00, 3033.43it/s]


Working on data/dude/processed/diversity_set/dude_target_gcr.csv


 66%|██████▋   | 10142/15258 [00:04<00:02, 2546.86it/s][22:46:45] Explicit valence for atom # 12 N, 5, is greater than permitted
 70%|██████▉   | 10663/15258 [00:04<00:01, 2571.43it/s]

Could not parse smiles to a valid molecule, smiles was: N#CC(NC(=O)c1ccccc1)=N1=C(c2ccccc2)C=C(c2ccccc2)C=C1c1ccccc1


100%|██████████| 15258/15258 [00:06<00:00, 2433.03it/s]


Working on data/dude/processed/diversity_set/dude_target_cp3a4.csv


100%|██████████| 11970/11970 [00:05<00:00, 2326.70it/s]


Working on data/dude/processed/diversity_set/dude_target_akt1.csv


100%|██████████| 16743/16743 [00:06<00:00, 2414.06it/s]


Working on data/dude/processed/diversity_set/dude_target_hivpr.csv


100%|██████████| 36286/36286 [00:16<00:00, 2257.40it/s]


'Mean:'

array([-0.28971602,  0.5181022 ,  0.37487135,  0.99922747, -0.04187301,
        1.03382471,  0.27407036,  0.99853436,  0.09725517,  1.12824307,
        0.23735556,  0.99882914,  0.35977538,  0.66653514,  0.41238282,
        0.99902545,  5.71261449,  6.37716992,  6.47293777,  6.26134733,
        6.20354385,  6.23201498])

'Std:'

array([0.35110473, 1.00839329, 0.4838859 , 0.02769204, 0.15418035,
       0.86446056, 0.44583626, 0.0381767 , 0.16095862, 0.92079483,
       0.42526185, 0.03413741, 0.35756229, 1.36093993, 0.4921059 ,
       0.0311619 , 1.9668792 , 2.31266486, 2.50699385, 2.41269982,
       2.30018205, 2.31527129])

# Benchmark FEPOPS using the DUDE diversity set
Collect AUROC scores and compare against Morgan 2 and RDKit fingeprints
<hr>

Perform imports, define similarity methods (except OpenFEPOPS which will be defined upon target assessment in order to load cached descriptors)

In [2]:
import numpy as np
from tqdm import tqdm

from dataclasses import dataclass
from typing import Callable, Optional
from pathlib import Path
import pandas as pd
from rdkit import Chem

from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.metrics import roc_auc_score
from fepops import OpenFEPOPS
from fepops.fepops_persistent import get_persistent_fepops_storage_object
from fepops.fepops import GetFepopStatusCode
import json

open_fepops_object=OpenFEPOPS()

@dataclass
class SimilarityMethod:
    name: str
    supports_multiple_candidates: bool
    descriptor_calc_func: Optional[Callable] = None
    descriptor_score_func: Optional[Callable] = None


# OpenFEPOPS will be added to this as a persistent object, reading from a DB of cached
# molecules for each target in the diversity set
similarity_methods = {
    'Morgan 2': SimilarityMethod(
        "Morgan 2",
        False,
        lambda x: AllChem.GetMorganFingerprint(x, 2),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
    'MACCS': SimilarityMethod(
        "MACCS",
        False,
        lambda x: rdMolDescriptors.GetMACCSKeysFingerprint(x),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
    'RDKit': SimilarityMethod(
        "RDKit",
        False,
        lambda x: Chem.RDKFingerprint(x,maxPath=4),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
}

diversity_set_csv_files = list(Path("data/dude/processed/diversity_set/").glob("dude_target_hiv*.csv"))

# Write all CSVs to SMILES files
print(f"Got {len(diversity_set_csv_files)} diversity_set_csv_files : {[f.stem for f in diversity_set_csv_files]}")
for csv_file_path in diversity_set_csv_files:
    pd.read_csv(csv_file_path,sep=",",
        index_col=[0],
        header=0,
        ).loc[:,['rdkit_canonical_smiles', 'DUDEID']].reset_index().to_csv(csv_file_path.with_suffix(".smi"), sep=" ", index=None, header=False)
    

Got 1 diversity_set_csv_files : ['dude_target_ampc']


Perform AUROC score calculation

In [10]:
auroc_scores_info_df=pd.DataFrame()

for csv_file_path in diversity_set_csv_files:
    
    # We replace the OpenFepops similarity object at each new diversity CSV file
    # so that new databases may be loaded for speed of descriptor retrieval.
    ofepops_persistent=get_persistent_fepops_storage_object(csv_file_path.with_suffix(".db"))
    similarity_methods['OpenFEPOPS']=SimilarityMethod(
        "OpenFEPOPS",
        True,
        lambda x: ofepops_persistent.get_fepops(x, is_canonical=True),
        ofepops_persistent.calc_similarity,
    )
    
    df=pd.read_csv(csv_file_path,sep=",",
                index_col=[0],
                header=0,
            ).reset_index()
    print(df.head())
    descriptors={k:[] for k in similarity_methods.keys()}
    smiles_list = df['rdkit_canonical_smiles'].tolist()
    labels_list = df['Active'].astype(int).tolist()
    problematic_compound_indexes=[]
    for sm_name, sm in similarity_methods.items():
        # Cache all descriptors for each molecular similarity technique, as some mols may be bad and have to be removed
        for smiles_i, smiles in tqdm(enumerate(smiles_list), desc=f"Caching {sm_name} descriptors for {csv_file_path.stem}"):
            mol_from_smiles=open_fepops_object._mol_from_smiles(smiles)
            if mol_from_smiles is None:
                problematic_compound_indexes.append(smiles_i)
                descriptors[sm_name].append(np.nan)
            else:
                res = sm.descriptor_calc_func(mol_from_smiles)
                if isinstance(res, tuple):
                    if res[0]==GetFepopStatusCode.FAILED_RETRIEVED_NONE or res[0]==GetFepopStatusCode.FAILED_TO_GENERATE or res[0]==GetFepopStatusCode.FAILED_TO_RETRIEVE or res[1] is None:
                        print(f"Problem with {smiles}, {res}")
                        problematic_compound_indexes.append(smiles_i)
                        descriptors[sm_name].append(np.nan)
                    else:                    
                        descriptors[sm_name].append(res[-1])
                else:
                    if res is None:
                        problematic_compound_indexes.append(smiles_i)
                    descriptors[sm_name].append(res)
    # Remove failed molecules from pool of descriptors and labels
    for k,v in descriptors.items():
        descriptors[k]=[v[ii] for ii in range(len(v)) if ii not in problematic_compound_indexes]
    labels_list=[labels_list[ii] for ii in range(len(labels_list)) if ii not in problematic_compound_indexes]
    auroc_scores={smn:[] for smn in similarity_methods.keys()}
    
    info=pd.Series(dtype=object)
    for sm_name, sm in similarity_methods.items():
        # Remove entries which did not return a mol
        info['target']=csv_file_path.stem.replace("dude_target_","")
        info['similarity_method']=sm_name
        info['smiles_count']=len(smiles_list)
        info['actives_count']=np.sum(labels_list)
        info['failed_smiles']=len(problematic_compound_indexes)
        info['failed_active_smiles']=len([ft for ft in problematic_compound_indexes if labels_list[ft]==1])
        for active_i in tqdm(
                np.argwhere(np.array(labels_list) == 1).flatten(),
                desc=f"Assessing active recall (AUROC) for {sm.name}",
            ):
            if sm.supports_multiple_candidates:
                scores = np.array(
                    sm.descriptor_score_func(
                        descriptors[sm_name][active_i], descriptors[sm_name]
                    ),
                    dtype=float,
                ).flatten()
            else:
                scores = np.array(
                    [
                        sm.descriptor_score_func(
                            descriptors[sm_name][active_i], descriptors[sm_name][smiles_i]
                        )
                        for smiles_i in range(len(descriptors[sm_name]))
                    ],
                    dtype=float,
                ).flatten()
            
            auroc_scores[sm_name].append(roc_auc_score(
                np.array(labels_list)[np.argwhere(~np.isnan(scores))],
                scores[np.argwhere(~np.isnan(scores))],
                )
            )
        info['average_auroc_score']=np.mean(auroc_scores[sm_name])
        info['median_auroc_score']=np.median(auroc_scores[sm_name])
        info['q1_auroc_score']=np.percentile(auroc_scores[sm_name],0.25)
        info['q3_auroc_score']=np.percentile(auroc_scores[sm_name],0.75)
        auroc_scores_info_df=pd.concat([auroc_scores_info_df, info.to_frame().T], ignore_index=True, axis=0)
    print("Writing to ", csv_file_path.parent)

    json.dump(auroc_scores,open(csv_file_path.parent/Path(f"res_scores_{csv_file_path.stem}.json"),"w"))
    auroc_scores_info_df.to_csv(csv_file_path.parent/Path(f"res_df_{csv_file_path.stem}.csv"))
print(auroc_scores_info_df)
    

                                              SMILES DUDEID  CHEMBLID  Active  \
0     S(Nc1c(O)cc(C(=O)O)cc1)(c2c(scc2)C(=O)O)(=O)=O    116       NaN       1   
1       S(Nc1ccc(C(=O)O)cc1)(c2ccsc2C(=O)[O-])(=O)=O    113       NaN       1   
2       S(Nc1cccc(c1)C(=O)O)(c2ccsc2C(=O)[O-])(=O)=O    114       NaN       1   
3   S(Nc1ccc(C(=O)O)c(c1)Cl)(c2ccsc2C(=O)[O-])(=O)=O    115       NaN       1   
4  OC(=O)[C@@H](CC1=CC=CC2=CC=CC=C12)N1C(=O)C2=C(...    211       NaN       1   

                              rdkit_canonical_smiles  
0         O=C(O)c1ccc(NS(=O)(=O)c2ccsc2C(=O)O)c(O)c1  
1         O=C(O)c1ccc(NS(=O)(=O)c2ccsc2C(=O)[O-])cc1  
2         O=C(O)c1cccc(NS(=O)(=O)c2ccsc2C(=O)[O-])c1  
3       O=C(O)c1ccc(NS(=O)(=O)c2ccsc2C(=O)[O-])cc1Cl  
4  O=C(O)c1ccc2c(c1)C(=O)N([C@H](Cc1cccc3ccccc13)...  


Caching Morgan 2 descriptors for dude_target_ampc: 2898it [00:00, 6261.77it/s]
Caching MACCS descriptors for dude_target_ampc: 2898it [00:01, 1745.90it/s]
Caching RDKit descriptors for dude_target_ampc: 2898it [00:00, 3455.80it/s]
Caching OpenFEPOPS descriptors for dude_target_ampc: 585it [00:00, 2932.53it/s]

Problem with Cc1ccc([C@H]2[C@@]([O-])([C@H](O)CO)[C@]3(O)[C@@H](c4ccc(C)cc4)[C@H]([O-])[C@@]23[O-])cc1, (<GetFepopStatusCode.FAILED_RETRIEVED_NONE: 4>, None)


Caching OpenFEPOPS descriptors for dude_target_ampc: 2898it [00:00, 3002.50it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 48/48 [00:00<00:00, 135.08it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 48/48 [00:00<00:00, 203.02it/s]
Assessing active recall (AUROC) for RDKit: 100%|██████████| 48/48 [00:00<00:00, 200.67it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 48/48 [00:06<00:00,  7.04it/s]

Writing to  data/dude/processed/diversity_set
  target similarity_method smiles_count actives_count failed_smiles  \
0   ampc          Morgan 2         2898            48             1   
1   ampc             MACCS         2898            48             1   
2   ampc             RDKit         2898            48             1   
3   ampc        OpenFEPOPS         2898            48             1   

  failed_active_smiles average_auroc_score median_auroc_score q1_auroc_score  \
0                    0            0.783629           0.831864       0.315866   
1                    0             0.67331           0.711149       0.365732   
2                    0            0.659837           0.662738       0.349989   
3                    0            0.639413           0.736706        0.22802   

  q3_auroc_score  
0       0.321303  
1       0.377163  
2       0.361342  
3        0.23266  





Output summary AUROC results table as shown in the paper

In [15]:
display(pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df*.csv")]).pivot(values='average_auroc_score',index='target', columns=['similarity_method'],)[['Morgan 2', 'RDKit', 'MACCS', 'OpenFEPOPS']])

similarity_method,Morgan 2,RDKit,MACCS,OpenFEPOPS
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
akt1,0.835717,0.833538,0.741306,0.831254
ampc,0.783629,0.659837,0.67331,0.639413
cp3a4,0.602774,0.613335,0.581545,0.647462
cxcr4,0.697226,0.592485,0.854251,0.899027
gcr,0.670082,0.708346,0.66583,0.616228
hivpr,0.779684,0.759303,0.681231,0.678182
hivrt,0.651023,0.66007,0.669954,0.58224
kif11,0.763057,0.672469,0.667852,0.713142


Output an extended table with information on failed molecules etc

In [16]:
display(pd.concat([pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df*.csv")]).pivot(values='average_auroc_score',index='target', columns=['similarity_method'],),pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df*.csv")]).query("similarity_method=='OpenFEPOPS'")[['target','smiles_count','actives_count','failed_smiles','failed_active_smiles']].set_index("target")], axis=1))

Unnamed: 0_level_0,MACCS,Morgan 2,OpenFEPOPS,RDKit,smiles_count,actives_count,failed_smiles,failed_active_smiles
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
akt1,0.741306,0.835717,0.831254,0.833538,16743,293,0,0
ampc,0.67331,0.783629,0.639413,0.659837,2898,48,1,0
cp3a4,0.581545,0.602774,0.647462,0.613335,11970,170,4,0
cxcr4,0.854251,0.697226,0.899027,0.592485,3446,40,0,0
gcr,0.66583,0.670082,0.616228,0.708346,15258,258,10,0
hivpr,0.681231,0.779684,0.678182,0.759303,36286,535,2,1
hivrt,0.669954,0.651023,0.58224,0.66007,19229,338,8,0
kif11,0.667852,0.763057,0.713142,0.672469,6966,116,1,0
