# Benchmark FEPOPS using the DUDE diversity set
Collect AUROC scores and compare against Morgan 2 and RDKit fingeprints
<hr>

In [26]:
import time
import numpy as np
from tqdm import tqdm
from rdkit.Chem import rdMolDescriptors

from fepops import OpenFEPOPS
from dataclasses import dataclass
from typing import Callable, Union, Optional
from pathlib import Path
import pandas as pd
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from sklearn.metrics import roc_auc_score
from fepops.fepops_persistent import get_persistent_fepops_storage_object
from typing import Union, Optional
from fepops import OpenFEPOPS
from fepops.fepops_persistent import get_persistent_fepops_storage_object
from fepops.fepops import GetFepopStatusCode
import json

open_fepops_object=OpenFEPOPS()

@dataclass
class SimilarityMethod:
    name: str
    supports_multiple_candidates: bool
    descriptor_calc_func: Optional[Callable] = None
    descriptor_score_func: Optional[Callable] = None


# OpenFEPOPS will be added to this as a persistent object, reading from a DB of cached
# molecules for each target in the diversity set
similarity_methods = {
    'Morgan 2': SimilarityMethod(
        "Morgan 2",
        False,
        lambda x: AllChem.GetMorganFingerprint(open_fepops_object._mol_from_smiles(x), 2),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
    'MACCS': SimilarityMethod(
        "MACCS",
        False,
        lambda x: rdMolDescriptors.GetMACCSKeysFingerprint(open_fepops_object._mol_from_smiles(x)),
        lambda x, y: DataStructs.TanimotoSimilarity(x,y),
    ),
}

diversity_set_csv_files = list(Path("data/dude/processed/diversity_set/").glob("dude_target_cxcr*.csv"))
print(f"Got {len(diversity_set_csv_files)} diversity_set_csv_files : {[f.stem for f in diversity_set_csv_files]}")
for csv_file_path in diversity_set_csv_files:
    pd.read_csv(csv_file_path,sep=",",
        index_col=[0],
        header=0,
        ).loc[:,['rdkit_canonical_smiles', 'DUDEID']].reset_index().to_csv(csv_file_path.with_suffix(".smi"), sep=" ", index=None, header=False)
    

Got 1 diversity_set_csv_files : ['dude_target_cxcr4']


In [30]:
auroc_scores_info_df=pd.DataFrame()

for csv_file_path in diversity_set_csv_files:
    
    # We replace the OpenFepops similarity object at each new diversity CSV file
    # so that new databases may be loaded for speed of descriptor retrieval.
    ofepops_persistent=get_persistent_fepops_storage_object(csv_file_path.with_suffix(".db"))
    similarity_methods['OpenFEPOPS']=SimilarityMethod(
        "OpenFEPOPS",
        True,
        lambda x: ofepops_persistent.get_fepops(x, is_canonical=True),
        ofepops_persistent.calc_similarity,
    )
    
    df=pd.read_csv(csv_file_path,sep=",",
                index_col=[0],
                header=0,
            ).reset_index()
    print(df.head())
    descriptors={k:[] for k in similarity_methods.keys()}
    smiles_list = df['rdkit_canonical_smiles'].tolist()
    labels_list = df['Active'].astype(int).tolist()
    problematic_compound_indexes=[]
    for sm_name, sm in similarity_methods.items():
        # Cache all descriptors for each molecular similarity technique, as some mols may be bad and have to be removed
        for smiles_i, smiles in tqdm(enumerate(smiles_list), desc=f"Caching {sm_name} descriptors for {csv_file_path.stem}"):
            res = sm.descriptor_calc_func(smiles)
            if isinstance(res, tuple):
                if res[0]==GetFepopStatusCode.FAILED_RETRIEVED_NONE or res[0]==GetFepopStatusCode.FAILED_TO_GENERATE or res[0]==GetFepopStatusCode.FAILED_TO_RETRIEVE or res[1] is None:
                    print(f"Problem with {smiles}, {res}")
                    problematic_compound_indexes.append(smiles_i)
                    descriptors[sm_name].append(np.nan)
                else:                    
                    descriptors[sm_name].append(res[-1])
            else:
                if res is None:
                    problematic_compound_indexes.append(smiles_i)
                descriptors[sm_name].append(res)
    # Remove failed molecules from pool of descriptors and labels
    for k,v in descriptors.items():
        descriptors[k]=[v[ii] for ii in range(len(v)) if ii not in problematic_compound_indexes]
    labels_list=[labels_list[ii] for ii in range(len(labels_list)) if ii not in problematic_compound_indexes]
    auroc_scores={smn:[] for smn in similarity_methods.keys()}
    
    info=pd.Series(dtype=object)
    for sm_name, sm in similarity_methods.items():
        # Remove entries which did not return a mol
        info['target']=csv_file_path.stem.replace("dude_target_","")
        info['similarity_method']=sm_name
        info['smiles_count']=len(smiles_list)
        info['actives_count']=np.sum(labels_list)
        info['failed_smiles']=len(problematic_compound_indexes)
        info['failed_active_smiles']=len([ft for ft in problematic_compound_indexes if labels_list[ft]==1])
        for active_i in tqdm(
                np.argwhere(np.array(labels_list) == 1).flatten(),
                desc=f"Assessing active recall (AUROC) for {sm.name}",
            ):
            if sm.supports_multiple_candidates:
                scores = np.array(
                    sm.descriptor_score_func(
                        descriptors[sm_name][active_i], descriptors[sm_name]
                    ),
                    dtype=float,
                ).flatten()
            else:
                scores = np.array(
                    [
                        sm.descriptor_score_func(
                            descriptors[sm_name][active_i], descriptors[sm_name][smiles_i]
                        )
                        for smiles_i in range(len(descriptors[sm_name]))
                    ],
                    dtype=float,
                ).flatten()
            
            auroc_scores[sm_name].append(roc_auc_score(
                np.array(labels_list)[np.argwhere(~np.isnan(scores))],
                scores[np.argwhere(~np.isnan(scores))],
                )
            )
        info['average_auroc_score']=np.mean(auroc_scores[sm_name])
        info['median_auroc_score']=np.median(auroc_scores[sm_name])
        info['q1_auroc_score']=np.percentile(auroc_scores[sm_name],0.25)
        info['q3_auroc_score']=np.percentile(auroc_scores[sm_name],0.75)
        auroc_scores_info_df=pd.concat([auroc_scores_info_df, info.to_frame().T], ignore_index=True, axis=0)
    print("Writing to ", csv_file_path.parent)

    json.dump(auroc_scores,open(csv_file_path.parent/Path(f"res_scores_{csv_file_path.stem}.json"),"w"))
    print(auroc_scores_info_df)
auroc_scores_info_df.to_csv(csv_file_path.parent/Path(f"res_df_{csv_file_path.stem}.csv"))
        
print(auroc_scores_info_df)
    


                                              SMILES  DUDEID       CHEMBLID  \
0                   c1ccnc(c1)NCc2ccc(cc2)CNc3ccccn3  403120   CHEMBL237830   
1       c1cc(ccc1CN2CCCNCCNCCCNCC2)CN3CCCNCCNCCCNCC3   20346    CHEMBL18442   
2  c1cc2nc(c1)CCNCCN(CCNCC2)Cc3ccc(cc3)CN4CCNCCc5...  676182  CHEMBL1202231   
3    CC1(CN2C(=CSC2=N1)CS/C(=N\C3CCCCC3)/NC4CCCCC4)C  454523   CHEMBL460491   
4  CC1(CN2C(=CSC2=N1)CS/C(=N\C3CCCCCC3)/NC4CCCCCC4)C  454524   CHEMBL518501   

   Active                             rdkit_canonical_smiles  
0       1                   c1ccc(NCc2ccc(CNc3ccccn3)cc2)nc1  
1       1       c1cc(CN2CCCNCCNCCCNCC2)ccc1CN1CCCNCCNCCCNCC1  
2       1  c1cc2nc(c1)CCNCCN(Cc1ccc(CN3CCNCCc4cccc(n4)CCN...  
3       1     CC1(C)CN2C(CS/C(=N\C3CCCCC3)NC3CCCCC3)=CSC2=N1  
4       1   CC1(C)CN2C(CS/C(=N\C3CCCCCC3)NC3CCCCCC3)=CSC2=N1  


Caching Morgan 2 descriptors for dude_target_cxcr4: 3446it [00:01, 2343.81it/s]
Caching MACCS descriptors for dude_target_cxcr4: 3446it [00:05, 608.11it/s]
Caching OpenFEPOPS descriptors for dude_target_cxcr4: 3446it [00:00, 4735.22it/s]
Assessing active recall (AUROC) for Morgan 2: 100%|██████████| 40/40 [00:00<00:00, 62.09it/s]
Assessing active recall (AUROC) for MACCS: 100%|██████████| 40/40 [00:00<00:00, 79.36it/s]
Assessing active recall (AUROC) for OpenFEPOPS: 100%|██████████| 40/40 [00:23<00:00,  1.67it/s]

Writing to  data/dude/processed/diversity_set
  target similarity_method smiles_count actives_count failed_smiles  \
0  cxcr4          Morgan 2         3446            40             0   
1  cxcr4             MACCS         3446            40             0   
2  cxcr4        OpenFEPOPS         3446            40             0   

  failed_active_smiles average_auroc_score median_auroc_score q1_auroc_score  \
0                    0            0.697226           0.742565       0.446526   
1                    0            0.854251           0.910375       0.208633   
2                    0            0.898971           0.949655       0.386533   

  q3_auroc_score  
0       0.468805  
1       0.251604  
2       0.416588  
  target similarity_method smiles_count actives_count failed_smiles  \
0  cxcr4          Morgan 2         3446            40             0   
1  cxcr4             MACCS         3446            40             0   
2  cxcr4        OpenFEPOPS         3446            40      




In [2]:
fb = FepopsBenchmarker("data/dude/processed/diversity_set/dude_target_cxcr4.db")
fb.auroc_performance("data/dude/processed/diversity_set/dude_target_cxcr4.csv")

Caching descriptors for Morgan:   0%|          | 0/3446 [00:00<?, ?it/s]

Caching descriptors for Morgan: 100%|██████████| 3446/3446 [00:00<00:00, 5342.35it/s]
Caching descriptors for FEPOPS: 100%|██████████| 3446/3446 [00:00<00:00, 8624.96it/s]
Assessing active recall (AUROC) for Morgan: 100%|██████████| 40/40 [00:00<00:00, 132.70it/s]
Assessing active recall (AUROC) for FEPOPS:   0%|          | 0/40 [00:00<?, ?it/s]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 88 is different from 154)

In [8]:
pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df*.csv")]).pivot(values='average_auroc_score',index='target', columns=['similarity_method'],)

similarity_method,Morgan 2,OpenFEPOPS
target,Unnamed: 1_level_1,Unnamed: 2_level_1
akt1,0.835717,0.828947
ampc,0.783629,0.639115
cp3a4,0.602774,0.649807
cxcr4,0.697226,0.898971
gcr,0.670138,0.616173
hivpr,0.779689,0.677882
hivrt,0.651011,0.583981
kif11,0.763058,0.713152


In [13]:
from scipy.stats import mannwhitneyu
fepops_benchmark_performance_df=pd.concat([pd.read_csv(f) for f in Path("data/dude/processed/diversity_set/").glob("res_df*.csv")]).pivot(values='average_auroc_score',index='target', columns=['similarity_method'],)
for target, scores_by_sm in {f.stem.split("_")[-1]:json.load(open(f)) for f in Path("data/dude/processed/diversity_set/").glob("res_scores*.json")}.items():
    print(target, mannwhitneyu(*scores_by_sm.values()))

kif11 MannwhitneyuResult(statistic=9756.0, pvalue=3.1618616130522878e-09)
hivrt MannwhitneyuResult(statistic=88174.0, pvalue=2.122934919642773e-34)
hivpr MannwhitneyuResult(statistic=226147.0, pvalue=1.1956892799046625e-60)
ampc MannwhitneyuResult(statistic=1726.0, pvalue=2.640883442863102e-05)
cxcr4 MannwhitneyuResult(statistic=163.0, pvalue=9.077946053273235e-10)
gcr MannwhitneyuResult(statistic=46891.0, pvalue=9.287514714384349e-16)
cp3a4 MannwhitneyuResult(statistic=8847.0, pvalue=6.31954006944608e-10)
akt1 MannwhitneyuResult(statistic=45065.0, pvalue=0.29635499822019484)
