In [1]:
import warnings
warnings.simplefilter(action="ignore")
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import BulkTanimotoSimilarity
from molvs.fragment import LargestFragmentChooser
import swifter

In [2]:
# Shared MolVS fragment chooser, built once and reused by smiles_format() for every molecule.
salt_fixer_object = LargestFragmentChooser(prefer_organic=True)

def smiles_format(sm):
    """Standardize a single SMILES string.

    The input is parsed with RDKit, reduced to one fragment with the shared
    ``salt_fixer_object`` and written back out as a canonical, isomeric,
    non-kekulized SMILES.

    Parameters
    ----------
    sm : str
        Raw SMILES string. Non-string values (e.g. NaN from an empty CSV cell)
        are tolerated and treated as unparsable.

    Returns
    -------
    str or float
        The standardized SMILES, or ``np.nan`` if the input could not be
        parsed or standardized.
    """
    try:
        mol = Chem.MolFromSmiles(sm)
        # RDKit signals an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return np.nan
        mol = salt_fixer_object.choose(mol)
        return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True, kekuleSmiles=False)
    except Exception:
        # Best-effort cleaning: any per-molecule failure becomes NaN so the caller
        # can drop the row. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate during long runs.
        return np.nan

def full_smiles_cleaner(df):
    """Standardize the ``SMILES`` column of a frame and drop rows that fail.

    Each distinct SMILES is standardized once with ``smiles_format`` (with a
    tqdm progress bar) and the result is mapped back onto every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``SMILES`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with standardized ``SMILES`` values and
        without the rows whose SMILES could not be standardized. The original
        index labels of the surviving rows are kept.
    """
    # Standardize each distinct string only once; reference sets contain duplicates.
    unique_smiles = set(df['SMILES'])
    smiles_standardized_dict = {smile: smiles_format(smile) for smile in tqdm(unique_smiles)}

    # Work on a copy so the caller's frame is left untouched and the returned frame
    # is not a slice of it (later column assignments then write to a real frame).
    cleaned_df = df.copy()
    cleaned_df['SMILES'] = cleaned_df['SMILES'].map(smiles_standardized_dict)
    return cleaned_df.dropna(subset=['SMILES'])

def ECFP4(smile):
    """Compute a Morgan bit-vector fingerprint for one SMILES string.

    Uses radius 2 and 2048 bits, which is what the ``ECFP4_2048`` suffix in the
    result column names refers to.

    Parameters
    ----------
    smile : str
        SMILES string of the molecule.

    Returns
    -------
    The RDKit bit vector returned by ``AllChem.GetMorganFingerprintAsBitVect``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Fail with the offending input instead of an opaque RDKit argument error.
        raise ValueError(f"Cannot compute ECFP4 fingerprint: unparsable SMILES {smile!r}")
    return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

def bulk_tanimoto(fp1, list_of_fps):
    """Score one fingerprint against a collection of fingerprints.

    Thin wrapper that forwards both arguments to RDKit's
    ``BulkTanimotoSimilarity`` and returns its result unchanged.
    """
    return BulkTanimotoSimilarity(fp1, list_of_fps)

def max_tanimoto_finder(list1_fps, list2_fps):
    """Return, for each fingerprint in ``list1_fps``, its best match in ``list2_fps``.

    Every query fingerprint is scored against the whole of ``list2_fps`` with
    ``bulk_tanimoto`` and only the largest similarity is kept. The output list
    has one entry per element of ``list1_fps``, in the same order.
    """
    return [np.max(bulk_tanimoto(query_fp, list2_fps)) for query_fp in list1_fps]

def run_tanimoto_sim(comparison_df, hits_df, column_name=None):
    """Add a nearest-neighbour Tanimoto similarity column to ``hits_df``.

    Fingerprints are computed with ``ECFP4`` for every SMILES in both frames;
    each hit is then assigned its maximum similarity to any molecule in
    ``comparison_df``.

    Parameters
    ----------
    comparison_df : pandas.DataFrame
        Reference molecules; must contain a ``SMILES`` column.
    hits_df : pandas.DataFrame
        Hit molecules; must contain a ``SMILES`` column. The new column is
        written onto this frame in place.
    column_name : str, optional
        Label used as the prefix of the new column, which is named
        ``f'{column_name}_NN_Tanimoto_ECFP4_2048'``. With the default ``None``
        the prefix is the literal text ``None``, so pass an explicit label.

    Returns
    -------
    pandas.DataFrame
        ``hits_df`` itself, with the similarity column added.
    """
    # Iterate the SMILES column directly: a row-wise apply builds a Series per row
    # only to read a single field from it.
    comparison_ECFP4_list = [ECFP4(smiles) for smiles in comparison_df['SMILES']]
    hits_ECFP4_list = [ECFP4(smiles) for smiles in hits_df['SMILES']]

    # Hits are the queries; the comparison set is what each hit is searched against.
    max_tanimoto_list = max_tanimoto_finder(hits_ECFP4_list, comparison_ECFP4_list)
    hits_df[f'{column_name}_NN_Tanimoto_ECFP4_2048'] = max_tanimoto_list
    return hits_df


def calculate_mean_pairwise_tanimoto(hits_df):
    """Mean Tanimoto similarity over all pairs of distinct rows in ``hits_df``.

    Parameters
    ----------
    hits_df : pandas.DataFrame
        Frame with a ``SMILES`` column; one fingerprint is computed per row
        with ``ECFP4``.

    Returns
    -------
    float
        The mean pairwise similarity, or ``np.nan`` when the frame has fewer
        than two rows (no pair exists, so the mean is undefined).
    """
    # Convert SMILES to ECFP4 fingerprints
    hits_ECFP4_list = [ECFP4(smiles) for smiles in hits_df['SMILES']]
    n = len(hits_ECFP4_list)

    # With zero or one molecule there is nothing to average; avoid dividing by zero.
    if n < 2:
        return np.nan

    # Tanimoto similarity is symmetric, so score each unordered pair (i < j) once.
    # This also means no self-comparison is ever computed or subtracted.
    total_similarity = 0.0
    for i in range(n - 1):
        similarities = BulkTanimotoSimilarity(hits_ECFP4_list[i], hits_ECFP4_list[i + 1:])
        total_similarity += np.sum(similarities)

    # Number of unordered pairs among n molecules.
    pair_count = n * (n - 1) // 2
    return total_similarity / pair_count

# Paper 1 (10.1126/sciadv.abg3338)

In [3]:
# Load the Paper 1 hits plus the two reference sets they are compared against below
# (fine-tuning data and ChEMBL compounds).
paper_1_hits_df = pd.read_csv('data/Paper 1/Paper 1 Hits.csv')
paper_1_fine_tune_df = pd.read_csv('data/Paper 1/Paper 1 Fine Tuning Data.csv')
paper_1_chembl_df = pd.read_csv('data/Paper 1/Paper_1_Chembl.csv')


#### Data Cleanup and SMILES Standardization

In [4]:
# Keep only the name, structure and activity columns of the hits table.
paper_1_hits_df = paper_1_hits_df[['Compound Name', 'SMILES', 'LXRa EC50 (µM)']]
# Standardize SMILES in all three frames; rows that fail standardization are dropped.
paper_1_hits_df = full_smiles_cleaner(paper_1_hits_df)
paper_1_fine_tune_df = full_smiles_cleaner(paper_1_fine_tune_df)
paper_1_chembl_df = full_smiles_cleaner(paper_1_chembl_df)

100%|██████████| 12/12 [00:00<00:00, 612.55it/s]
100%|██████████| 37/37 [00:00<00:00, 730.32it/s]
100%|██████████| 1257/1257 [00:01<00:00, 751.93it/s]


In [5]:
# For every hit, record its nearest-neighbour Tanimoto similarity to each reference set.
# Adds the FineTune_NN_Tanimoto_ECFP4_2048 and ChEMBL_NN_Tanimoto_ECFP4_2048 columns.
paper_1_hits_df = run_tanimoto_sim(paper_1_fine_tune_df, paper_1_hits_df, column_name="FineTune")
paper_1_hits_df = run_tanimoto_sim(paper_1_chembl_df, paper_1_hits_df, column_name="ChEMBL")

Pandas Apply:   0%|          | 0/39 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/12 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2719 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/12 [00:00<?, ?it/s]

In [6]:
# Display the hits with the nearest-neighbour similarity columns added above.
paper_1_hits_df

Unnamed: 0,Compound Name,SMILES,LXRa EC50 (µM),FineTune_NN_Tanimoto_ECFP4_2048,ChEMBL_NN_Tanimoto_ECFP4_2048
0,Compound 1,CN(C(=O)C(c1ccccc1)c1ccccc1)c1ccc(C(O)(C(F)(F)...,4.5 ± 0.1,0.642857,0.642857
1,Compound 5,CN(C(=O)c1c(F)cc(F)cc1F)c1ccc(C(O)(C(F)(F)F)C(...,0.26 ± 0.01,0.604651,0.604651
2,Compound 6,CN(C(=O)c1c(F)cccc1Cl)c1ccc(C(O)(C(F)(F)F)C(F)...,0.183 ± 0.006,0.586957,0.586957
3,Compound 8,CN(C(=O)c1ccc(F)cc1)c1ccc(C(O)(C(F)(F)F)C(F)(F...,1.05 ± 0.01,0.763158,0.763158
4,Compound 9,CN(C(=O)c1ccc(F)cc1Cl)c1ccc(C(O)(C(F)(F)F)C(F)...,1.68 ± 0.03,0.553191,0.553191
5,Compound 10,CN(C(=O)c1cccc(C(F)(F)F)c1)c1ccc(C(O)(C(F)(F)F...,1.19 ± 0.01,0.731707,0.731707
6,Compound 11,CN(C(=O)c1cccc(Cl)c1Cl)c1ccc(C(O)(C(F)(F)F)C(F...,1.31 ± 0.03,0.613636,0.613636
7,Compound 12,CN(C(=O)c1cccc(F)c1)c1ccc(C(O)(C(F)(F)F)C(F)(F...,0.8 ± 0.3,0.731707,0.731707
8,Compound 13,CN(C(=O)c1cccc2ccccc12)c1ccc(C(O)(C(F)(F)F)C(F...,1.1 ± 0.1,0.636364,0.636364
9,Compound 14,CN(C(=O)c1ccccc1Cl)c1ccc(C(O)(C(F)(F)F)C(F)(F)...,0.30 ± 0.01,0.666667,0.666667


In [7]:
# Mean nearest-neighbour similarity of the hits to the fine-tuning set (2 decimals).
round(paper_1_hits_df['FineTune_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.66

In [8]:
# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_1_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.66

In [9]:
# Mean pairwise Tanimoto similarity among the Paper 1 hits themselves (2 decimals).
round(calculate_mean_pairwise_tanimoto(paper_1_hits_df), 2)

Pandas Apply:   0%|          | 0/12 [00:00<?, ?it/s]

0.56

# Paper 2 (10.1021/acs.jcim.2c00068)

In [10]:
# Load the Paper 2 hits and the ChEMBL reference set they are compared against below.
paper_2_hits_df = pd.read_csv('data/Paper 2/Paper 2 Hits.csv')
paper_2_chembl_df = pd.read_csv('data/Paper 2/Paper_2_Chembl.csv')

#### Data Cleanup and SMILES Standardization

In [11]:
# Standardize SMILES in both frames; rows that fail standardization are dropped.
paper_2_hits_df = full_smiles_cleaner(paper_2_hits_df)
paper_2_chembl_df = full_smiles_cleaner(paper_2_chembl_df)

100%|██████████| 15/15 [00:00<00:00, 678.87it/s]
100%|██████████| 809/809 [00:01<00:00, 753.08it/s]


In [12]:
# For every hit, record its nearest-neighbour Tanimoto similarity to the ChEMBL set.
paper_2_hits_df = run_tanimoto_sim(paper_2_chembl_df, paper_2_hits_df, column_name="ChEMBL")

Pandas Apply:   0%|          | 0/1213 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/15 [00:00<?, ?it/s]

In [13]:
# Display the hits with the nearest-neighbour similarity column added above.
paper_2_hits_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Compound Name,SMILES,MERTK IC50 (nM),ChEMBL_NN_Tanimoto_ECFP4_2048
0,0,0,4b,N[C@H]1CC[C@H](Nc2nc(Nc3ccc(S(=O)(=O)N4CCOCC4)...,271.8,0.518072
1,1,1,4d,CN1CCN(Cc2cccc(Nc3nc(N[C@H]4CC[C@H](N)CC4)c4c[...,2911.0,0.344444
2,2,2,5a,N[C@H]1CC[C@H](Nc2nc(Nc3ccc(S(=O)(=O)N4CCOCC4)...,53.4,0.506024
3,3,3,5c,CN1CCN(Cc2cccc(Nc3nc(N[C@H]4CC[C@H](N)CC4)c4cc...,259.4,0.367816
4,4,4,8a,NC(=O)C1CCCN(c2nc(Nc3ccc(S(=O)(=O)N4CCOCC4)cc3...,679.0,0.377778
5,5,5,8b,O=C1NCCC12CCCN(c1nc(Nc3ccc(S(=O)(=O)N4CCOCC4)c...,1022.0,0.347826
6,6,6,8c,NC(=O)c1cccc(-c2nc(Nc3ccc(S(=O)(=O)N4CCOCC4)cc...,505.9,0.362637
7,7,7,8d,NC(=O)c1cn(-c2nc(Nc3ccc(S(=O)(=O)N4CCOCC4)cc3)...,5961.0,0.366667
8,8,8,15a,O=S(=O)(c1ccc(Nc2ncc3c(-c4cn[nH]c4)c[nH]c3n2)c...,1112.0,0.390244
9,9,9,15b,CNC(=O)c1ccc(Nc2ncc3c(-c4cn[nH]c4)c[nH]c3n2)cc1,1007.0,0.320988


In [14]:
# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_2_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.37

In [15]:
# Mean pairwise Tanimoto similarity among the Paper 2 hits themselves (2 decimals).
round(calculate_mean_pairwise_tanimoto(paper_2_hits_df), 2)

Pandas Apply:   0%|          | 0/15 [00:00<?, ?it/s]

0.28

# Paper 4 (10.1021/acsmedchemlett.2c00515)

In [16]:
# Load the Paper 4 hits plus the two reference sets they are compared against below
# (seed compounds and ChEMBL compounds).
paper_4_hits_df = pd.read_csv('data/Paper 4/Paper 4 Hits.csv')
paper_4_seeds_df = pd.read_csv('data/Paper 4/Paper 4 Seeds.csv')
paper_4_chembl_df = pd.read_csv('data/Paper 4/Paper_4_Chembl.csv')

#### Data Cleanup and SMILES Standardization

In [17]:
# Standardize SMILES in all three frames; rows that fail standardization are dropped.
paper_4_hits_df = full_smiles_cleaner(paper_4_hits_df)
paper_4_seeds_df = full_smiles_cleaner(paper_4_seeds_df)
paper_4_chembl_df = full_smiles_cleaner(paper_4_chembl_df)

100%|██████████| 4/4 [00:00<00:00, 713.01it/s]
100%|██████████| 10/10 [00:00<00:00, 721.65it/s]
100%|██████████| 3686/3686 [00:04<00:00, 775.09it/s]


In [18]:
# For every hit, record its nearest-neighbour Tanimoto similarity to each reference set.
# Adds the Seeds_NN_Tanimoto_ECFP4_2048 and ChEMBL_NN_Tanimoto_ECFP4_2048 columns.
paper_4_hits_df = run_tanimoto_sim(paper_4_seeds_df, paper_4_hits_df, column_name="Seeds")
paper_4_hits_df = run_tanimoto_sim(paper_4_chembl_df, paper_4_hits_df, column_name="ChEMBL")

Pandas Apply:   0%|          | 0/10 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4654 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
# Display the hits with the nearest-neighbour similarity columns added above.
paper_4_hits_df

Unnamed: 0.1,Unnamed: 0,Compound Name,SMILES,CDK2 IC50 (µM),Seeds_NN_Tanimoto_ECFP4_2048,ChEMBL_NN_Tanimoto_ECFP4_2048
0,0,Compound 1,CC(C)(C)c1cnc(CSc2nc(NC(=O)C3CCNCC3)ncc2Cl)o1,8.0,0.630769,0.630769
1,1,Compound 6,CC(C)Nc1nc2[nH]c(-c3ccc(S(C)(=O)=O)cc3)nc(=O)c...,5.2,0.163043,0.380952
2,2,Compound 8,Cc1c(-c2ccc(F)cc2)ccnc1Nc1ccnc(N)n1,1.5,0.37931,0.3125
3,3,Compound 10,CC(C)NC(=O)O[C@@H]1CC[C@H](c2cnc3[nH]c(-c4ccnn...,0.0015,0.390805,0.32


In [20]:
# Mean nearest-neighbour similarity of the hits to the seed set (2 decimals).
round(paper_4_hits_df['Seeds_NN_Tanimoto_ECFP4_2048'].mean(), 2)


0.39

In [21]:
# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_4_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.41

In [22]:
# Mean pairwise Tanimoto similarity among the Paper 4 hits themselves (2 decimals).
round(calculate_mean_pairwise_tanimoto(paper_4_hits_df), 2)

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

0.14

# Paper 5 (10.1038/s42004-022-00733-0)

In [23]:
# Load the Paper 5 hits plus the three reference sets they are compared against below
# (training data, the reported-nearest-neighbour file and ChEMBL compounds).
paper_5_hits_df = pd.read_csv('data/Paper 5/Paper 5 Hits.csv')
paper_5_training_df = pd.read_csv('data/Paper 5/Paper 5 Training Data.csv')
paper_5_reported_NN_df = pd.read_csv('data/Paper 5/Paper_5_Reported_Nearest_Neighbor.csv')
paper_5_chembl_df = pd.read_csv('data/Paper 5/Paper_5_Chembl.csv')

#### Data Cleanup and SMILES Standardization

In [24]:
# Standardize SMILES in all four frames; rows that fail standardization are dropped.
paper_5_hits_df = full_smiles_cleaner(paper_5_hits_df)
paper_5_training_df = full_smiles_cleaner(paper_5_training_df)
paper_5_reported_NN_df = full_smiles_cleaner(paper_5_reported_NN_df)
paper_5_chembl_df = full_smiles_cleaner(paper_5_chembl_df)

100%|██████████| 4/4 [00:00<00:00, 794.11it/s]
100%|██████████| 4081/4081 [00:05<00:00, 749.36it/s]
100%|██████████| 3/3 [00:00<00:00, 840.77it/s]
100%|██████████| 10838/10838 [00:14<00:00, 744.99it/s]


In [25]:
# For every hit, record its nearest-neighbour Tanimoto similarity to each reference set.
# Adds the Train_, Reported_ and ChEMBL_NN_Tanimoto_ECFP4_2048 columns.
paper_5_hits_df = run_tanimoto_sim(paper_5_training_df, paper_5_hits_df, column_name="Train")
paper_5_hits_df = run_tanimoto_sim(paper_5_reported_NN_df, paper_5_hits_df, column_name="Reported")
paper_5_hits_df = run_tanimoto_sim(paper_5_chembl_df, paper_5_hits_df, column_name="ChEMBL")


Pandas Apply:   0%|          | 0/4081 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20788 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
# Display the hits with the nearest-neighbour similarity columns added above.
paper_5_hits_df

Unnamed: 0,Compound Name,SMILES,EGFR IC50 (µM),Train_NN_Tanimoto_ECFP4_2048,Reported_NN_Tanimoto_ECFP4_2048,ChEMBL_NN_Tanimoto_ECFP4_2048
0,Compound 1,Fc1ccc(Nc2ncnc3ccc(Br)cc23)cc1,0.03,0.590909,0.534884,0.604651
1,Compound 2,Nc1c(Cl)cc(Nc2ncnc3ccc(F)cc23)cc1Cl,0.04,0.5,0.489362,0.630435
2,Compound 3,Cc1ccc(Nc2ncnc3cccc(F)c23)cc1F,0.21,0.458333,0.458333,0.461538
3,Compound 4,Nc1c(Cl)cc(Nc2ncnc3c(F)cccc23)cc1Cl,1.4,0.423077,0.42,0.510204


In [27]:
# Mean nearest-neighbour similarity of the hits to the training set (2 decimals).
round(paper_5_hits_df['Train_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.49

In [28]:
# Mean nearest-neighbour similarity of the hits to the reported-nearest-neighbour set (2 decimals).
round(paper_5_hits_df['Reported_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.48

In [29]:
# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_5_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.55

In [30]:
# Mean pairwise Tanimoto similarity among the Paper 5 hits themselves (2 decimals).
round(calculate_mean_pairwise_tanimoto(paper_5_hits_df), 2)

Pandas Apply:   0%|          | 0/4 [00:00<?, ?it/s]

0.42

# Paper 6 (10.48550/arXiv.2402.08210)

In [31]:
# Load the Paper 6 hits plus the two reference sets they are compared against below
# (training data and ChEMBL compounds).
paper_6_hits_df = pd.read_csv('data/Paper 6/Paper 6 Hits.csv')
paper_6_train_df = pd.read_csv('data/Paper 6/Paper 6 Training Data.csv')
paper_6_chembl_df = pd.read_csv('data/Paper 6/Paper_6_Chembl.csv')


#### Data Cleanup and SMILES Standardization 

In [32]:
# Standardize SMILES in all three frames; rows that fail standardization are dropped.
paper_6_hits_df = full_smiles_cleaner(paper_6_hits_df)
paper_6_train_df = full_smiles_cleaner(paper_6_train_df)
paper_6_chembl_df = full_smiles_cleaner(paper_6_chembl_df)

100%|██████████| 2/2 [00:00<00:00, 677.54it/s]
100%|██████████| 631/631 [00:00<00:00, 651.72it/s]
100%|██████████| 709/709 [00:01<00:00, 527.44it/s]


In [33]:
# For every hit, record its nearest-neighbour Tanimoto similarity to each reference set.
# Adds the Train_NN_Tanimoto_ECFP4_2048 and ChEMBL_NN_Tanimoto_ECFP4_2048 columns.
paper_6_hits_df = run_tanimoto_sim(paper_6_train_df, paper_6_hits_df, column_name="Train")
paper_6_hits_df = run_tanimoto_sim(paper_6_chembl_df, paper_6_hits_df, column_name="ChEMBL")


Pandas Apply:   0%|          | 0/638 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1755 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
# Display the hits with the nearest-neighbour similarity columns added above.
paper_6_hits_df

Unnamed: 0,Compound Name,SMILES,KRAS IC50 (µM),Train_NN_Tanimoto_ECFP4_2048,ChEMBL_NN_Tanimoto_ECFP4_2048
0,ISM061-018-2,Cc1ccc2[nH]ncc2c1C=Cc1cnc(Oc2cccc(F)c2)nc1,3.6 - 6.8,0.276596,0.27381
1,ISM061-22,COc1c(C)cccc1-c1nccc(Nc2cccc3c2ccn3C)n1,13.8 - 24.6,0.271739,0.195876


In [35]:
# Mean nearest-neighbour similarity of the hits to the training set (2 decimals).
round(paper_6_hits_df['Train_NN_Tanimoto_ECFP4_2048'].mean(), 2)


0.27

In [36]:
# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_6_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.23

In [37]:
# Mean pairwise Tanimoto similarity among the Paper 6 hits themselves (2 decimals).
round(calculate_mean_pairwise_tanimoto(paper_6_hits_df), 2)

Pandas Apply:   0%|          | 0/2 [00:00<?, ?it/s]

0.15

# Paper 7 (10.1038/s42256-024-00809-7)

In [38]:
# Load the Paper 7 hits plus the two reference sets they are compared against below
# (training data and ChEMBL compounds).
paper_7_hits_df = pd.read_csv('data/Paper 7/Paper 7 Hits.csv')
paper_7_train_df = pd.read_csv('data/Paper 7/Paper 7 Training Data.csv')
paper_7_chembl_df = pd.read_csv('data/Paper 7/Paper_7_Chembl.csv')


#### Data Cleanup and SMILES Standardization

In [39]:
# The training and ChEMBL frames carry their structures in a lowercase 'smiles' column;
# copy it into 'SMILES', the column name the helper functions read.
paper_7_train_df['SMILES'] = list(paper_7_train_df['smiles'])
paper_7_chembl_df['SMILES'] = list(paper_7_chembl_df['smiles'])
# Standardize SMILES in all three frames; rows that fail standardization are dropped.
paper_7_hits_df = full_smiles_cleaner(paper_7_hits_df)
paper_7_train_df = full_smiles_cleaner(paper_7_train_df)
paper_7_chembl_df = full_smiles_cleaner(paper_7_chembl_df)



100%|██████████| 6/6 [00:00<00:00, 707.64it/s]
100%|██████████| 13524/13524 [00:18<00:00, 735.99it/s]
100%|██████████| 1005/1005 [00:01<00:00, 514.11it/s]


In [40]:
# For every hit, record its nearest-neighbour Tanimoto similarity to each reference set.
# Adds the Train_NN_Tanimoto_ECFP4_2048 and ChEMBL_NN_Tanimoto_ECFP4_2048 columns.
paper_7_hits_df = run_tanimoto_sim(paper_7_train_df, paper_7_hits_df, column_name="Train")
paper_7_hits_df = run_tanimoto_sim(paper_7_chembl_df, paper_7_hits_df, column_name="ChEMBL")



Pandas Apply:   0%|          | 0/13524 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/6 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1005 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/6 [00:00<?, ?it/s]

In [41]:
# Display the hits with the nearest-neighbour similarity columns added above.
paper_7_hits_df

Unnamed: 0,Compound Name,SMILES,A. baumannii ATCC 17978 MIC (µM),Train_NN_Tanimoto_ECFP4_2048,ChEMBL_NN_Tanimoto_ECFP4_2048
0,Enamine 28,O=S(=O)(NC(c1ccc(OC(F)(F)F)cc1)C(F)(F)F)c1cc(C...,≤ 14.9529593136725,0.322581,0.190476
1,Enamine 23,O=S(=O)(c1cc(F)ccc1NCc1ccccc1O)C(F)(F)F,≤ 22.9200369386517,0.333333,0.232143
2,Enamine 10,O=S(=O)(c1ccccc1NCc1cc(F)cc(F)c1O)C(F)(F)F,≤ 21.796574164041,0.288462,0.180851
3,Enamine 40,O=C(Nc1nc2ccc(S(F)(F)(F)(F)F)cc2s1)C(F)(F)c1cc...,≤ 13.5158590683902,0.342857,0.183908
4,Enamine 43,COc1cccc(C2(NCCCCCCSc3ccccc3OC(C)=O)CCCCC2)c1,≤ 17.572784798036,0.349206,0.189189
5,Enamine 31,Brc1cccc(CCCNC2CCc3cccc4cccc2c34)c1,≤ 21.1029698366025,0.345455,0.153846


In [42]:
# Mean nearest-neighbour similarity of the hits to the training set (2 decimals).
round(paper_7_hits_df['Train_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.33

In [43]:
# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_7_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.19

In [44]:
# Mean pairwise Tanimoto similarity among the Paper 7 hits themselves (2 decimals).
round(calculate_mean_pairwise_tanimoto(paper_7_hits_df), 2)

Pandas Apply:   0%|          | 0/6 [00:00<?, ?it/s]

0.15

# Paper 9 (10.1021/acs.jmedchem.2c00931)

In [45]:
# Load the Paper 9 hits and the ChEMBL reference set they are compared against below.
paper_9_hits_df = pd.read_csv('data/Paper 9/Paper 9 Hits.csv')
paper_9_chembl_df = pd.read_csv('data/Paper 9/Paper_9_Chembl.csv')

#### Data Cleanup and SMILES Standardization

In [46]:
# Standardize SMILES in both frames; rows that fail standardization are dropped.
paper_9_hits_df = full_smiles_cleaner(paper_9_hits_df)
paper_9_chembl_df = full_smiles_cleaner(paper_9_chembl_df)


100%|██████████| 1/1 [00:00<00:00, 644.78it/s]
100%|██████████| 1176/1176 [00:01<00:00, 749.73it/s]


In [47]:
# For every hit, record its nearest-neighbour Tanimoto similarity to the ChEMBL set.
paper_9_hits_df = run_tanimoto_sim(paper_9_chembl_df, paper_9_hits_df, column_name="ChEMBL")


Pandas Apply:   0%|          | 0/1818 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
# Display the hits with the nearest-neighbour similarity column added above.
paper_9_hits_df

Unnamed: 0,Compound Name,SMILES,CDK8 IC50 (µM),ChEMBL_NN_Tanimoto_ECFP4_2048
0,29,O=C(Nc1ccncc1)c1ccncc1,< 10,0.357143


In [49]:

# Mean nearest-neighbour similarity of the hits to the ChEMBL set (2 decimals).
round(paper_9_hits_df['ChEMBL_NN_Tanimoto_ECFP4_2048'].mean(), 2)

0.36