**Note:**  
The similarity check between FDA active drugs and the training data is done here.
Other comparisons are available here as well.

In [None]:
! pip install seaborn
! pip install matplotlib
! pip install rdkit
! pip install datamol
! pip install sklearn
! pip install scikit-fingerprints
! pip install shap
! pip install xgboost

Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3
Collecting datamol
  Downloading datamol-0.12.5-py3-none-any.whl.metadata (8.0 kB)
Collecting loguru (from datamol)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting selfies (from datamol)
  Downloading selfies-2.2.0-py3-none-any.whl.metadata (14 kB)
Downloading datamol-0.12.5-py3-none-any.whl (495 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.4/495.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading loguru-0.7.3-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
# ------------------- SIMILARITY ANALYSIS BETWEEN FDA AND TRAIN ACTIVES -------------------

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import pandas as pd
from tqdm import tqdm

# Read the labelled training set and the model's predictions for the FDA drugs
# (both paths are relative to the working directory -- adjust if yours differ)
df_train = pd.read_csv("Dataset with selected features.csv")
df_fda = pd.read_csv("fda_predictions.csv")

# Keep the known actives from training and the predicted actives from the FDA set;
# .copy() so that columns added later do not touch the loaded frames
train_actives = df_train.loc[df_train['activity'].eq(1)].copy()
fda_actives = df_fda.loc[df_fda['predicted_activity'].eq(1)].copy()

# Convert SMILES to RDKit Mol objects and compute ECFP4 fingerprints (radius=2)
def get_ecfp4_fingerprint(smiles):
    """Return the Morgan (radius 2, 2048-bit) fingerprint for a SMILES string.

    Args:
        smiles: SMILES string. Missing values (``None``, ``NaN`` or any other
            non-string) and blank strings are treated as invalid input.

    Returns:
        The bit vector produced by ``AllChem.GetMorganFingerprintAsBitVect``,
        or ``None`` when the input is not a usable string or RDKit cannot
        parse it into a molecule.
    """
    # Missing cells read by pandas arrive as float NaN, not as strings. Hand
    # those (and blank strings) back as None so the caller's `.notna()` filter
    # drops them rather than passing a non-SMILES value on to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)

# Fingerprint both sets; SMILES that cannot be fingerprinted come back as None
train_actives['fp'] = train_actives['Smiles'].apply(get_ecfp4_fingerprint)
fda_actives['fp'] = fda_actives['Smiles'].apply(get_ecfp4_fingerprint)

# Remove rows with invalid fingerprints
train_actives = train_actives[train_actives['fp'].notna()]
fda_actives = fda_actives[fda_actives['fp'].notna()]

# Pull the training columns out once, so the loop below does not have to
# rebuild a pandas row object for every FDA/train pair.
train_fps = train_actives['fp'].tolist()
train_names = train_actives['Name'].tolist()
train_smiles = train_actives['Smiles'].tolist()

# Compute Tanimoto similarity between every FDA active and every train active
similarities = []

print("Calculating Tanimoto similarity between FDA actives and train actives...")

for fda_name, fda_smiles, fda_fp in tqdm(
    zip(fda_actives['Name'], fda_actives['Smiles'], fda_actives['fp']),
    total=len(fda_actives),
):
    # One bulk call scores this FDA compound against all training actives,
    # in the same order as train_names / train_smiles.
    pair_sims = DataStructs.BulkTanimotoSimilarity(fda_fp, train_fps)
    for train_name, train_smi, sim in zip(train_names, train_smiles, pair_sims):
        similarities.append({
            'FDA_Name': fda_name,
            'FDA_SMILES': fda_smiles,
            'Train_Name': train_name,
            'Train_SMILES': train_smi,
            'Similarity': sim
        })

# Create a DataFrame from similarity results. The columns are named explicitly
# so that the frame still has a 'Similarity' column when there are no pairs.
similarity_df = pd.DataFrame(
    similarities,
    columns=['FDA_Name', 'FDA_SMILES', 'Train_Name', 'Train_SMILES', 'Similarity'],
)

# Optional: Save to CSV
similarity_df.to_csv("FDA_vs_Train_similarity.csv", index=False)

# Show top 10 most similar pairs
similarity_df.sort_values(by='Similarity', ascending=False).head(10)




Calculating Tanimoto similarity between FDA actives and train actives...


100%|██████████| 82/82 [00:10<00:00,  8.10it/s]


Unnamed: 0,FDA_Name,FDA_SMILES,Train_Name,Train_SMILES,Similarity
19407,LAPATINIB DITOSYLATE,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,CHEMBL409318,Cc1cc(C)c2oc(-c3ccc(NC(=O)COc4ccc(F)cc4C)cc3)n...,0.263636
35413,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL2152170,O=C(NC1CC(F)(F)C1)N[C@@](Cc1ccccc1)(c1cc(F)cc(...,0.252101
19607,LAPATINIB DITOSYLATE,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,CHEMBL264963,Cc1cc(C)c2oc(-c3ccc(NC(=O)COc4ccc(Cl)c(C)c4C)c...,0.25
35717,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL1684864,COc1cccc(Cl)c1-c1ccc(N2CCC(CNC(=O)c3ccc(-c4nc5...,0.244275
35342,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL1684870,CC(C)c1ccccc1-c1ccc(N2CCC(CNC(=O)c3ccc(-c4nc5c...,0.243902
19585,LAPATINIB DITOSYLATE,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,CHEMBL261502,Cc1c(Cl)ccc(OCC(=O)Nc2ccc(-c3nc4cc(C#N)ccc4o3)...,0.241379
35577,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL1684861,CC(C)c1cc(C#N)cc2nc(-c3ccc(C(=O)NCC4CCN(c5ccc(...,0.235772
35425,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL2165707,N#Cc1ccc([C@@](Cc2ccccc2)(NC(=O)NC2CCCC2)c2cc(...,0.234783
35378,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL2152166,O=C(NC1CCC(F)(F)C1)N[C@@](Cc1ccccc1)(c1cc(F)cc...,0.233871
35458,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,CHEMBL2152167,O=C(N[C@@H]1CCC(F)(F)C1)N[C@@](Cc1ccccc1)(c1cc...,0.233871


In [None]:
# For each FDA active, compute the max and mean Tanimoto similarity to the
# training actives.
# NOTE: the reference set is `train_actives` (training rows with activity == 1),
# not the full training set.

train_active_fps = train_actives['fp'].tolist()

fda_summary = []

for fda_name, fda_smiles, fda_fp, fda_prob in tqdm(
    zip(
        fda_actives['Name'],
        fda_actives['Smiles'],
        fda_actives['fp'],
        fda_actives['probability_active'],
    ),
    total=len(fda_actives),
):
    # One bulk call scores this FDA compound against every reference fingerprint
    similarities = DataStructs.BulkTanimotoSimilarity(fda_fp, train_active_fps)
    has_reference = len(similarities) > 0
    fda_summary.append({
        'Name': fda_name,
        'SMILES': fda_smiles,
        # With an empty reference set there is nothing to compare against:
        # report NaN instead of failing in max() or dividing by zero.
        'Max_Similarity': max(similarities) if has_reference else float('nan'),
        'Mean_Similarity': (
            sum(similarities) / len(similarities) if has_reference else float('nan')
        ),
        'Predicted_Prob': fda_prob
    })

# Columns are named explicitly so the sort below also works on an empty summary
fda_similarity_df = pd.DataFrame(
    fda_summary,
    columns=['Name', 'SMILES', 'Max_Similarity', 'Mean_Similarity', 'Predicted_Prob'],
)

# Optional: Save to CSV
fda_similarity_df.to_csv("similarity_to_all_df.csv", index=False)

fda_similarity_df.sort_values(by='Max_Similarity', ascending=False).head(10)


100%|██████████| 82/82 [00:00<00:00, 360.92it/s]


Unnamed: 0,Name,SMILES,Max_Similarity,Mean_Similarity,Predicted_Prob
41,LAPATINIB DITOSYLATE,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,0.263636,0.12143,0.643776
75,IVOSIDENIB,N#Cc1ccnc(N2C(=O)CC[C@H]2C(=O)N(c2cncc(F)c2)[C...,0.252101,0.132264,0.515909
55,AZILSARTAN KAMEDOXOMIL,CCOc1nc2cccc(C(=O)OCc3oc(=O)oc3C)c2n1Cc1ccc(-c...,0.228571,0.127725,0.50673
74,UMBRALISIB TOSYLATE,CC(C)Oc1ccc(-c2nn([C@@H](C)c3oc4ccc(F)cc4c(=O)...,0.218978,0.126691,0.537234
35,OLMESARTAN MEDOXOMIL,CCCc1nc(C(C)(C)O)c(C(=O)OCc2oc(=O)oc2C)n1Cc1cc...,0.214286,0.132277,0.643186
21,VORAPAXAR,CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...,0.213115,0.150722,0.614083
66,VENETOCLAX,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,0.212766,0.134615,0.802252
70,OTESECONAZOLE,O[C@@](Cn1cnnn1)(c1ccc(F)cc1F)C(F)(F)c1ccc(-c2...,0.211111,0.136184,0.508387
65,ELBASVIR,COC(=O)N[C@H](C(=O)N1CCC[C@H]1c1nc(-c2ccc3c(c2...,0.204918,0.14056,0.767688
22,TRABECTEDIN,COc1cc2c(cc1O)CCN[C@]21CS[C@@H]2c3c(OC(C)=O)c(...,0.203125,0.107064,0.522456


In [None]:
# ------------------ Applicability Domain Filter for FDA Hits ------------------

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from tqdm import tqdm
import pandas as pd

# Read the labelled training set and the FDA prediction table
df_train = pd.read_csv("Dataset with selected features.csv")
df_fda = pd.read_csv("fda_predictions.csv")

# Reference set for the similarity check: training compounds labelled active
train_actives = df_train.loc[df_train['activity'].eq(1)].copy()

# Convert SMILES to RDKit mols and compute ECFP4 fingerprints
def compute_fingerprint(smiles):
    """Return the Morgan (radius 2, 2048-bit) fingerprint for a SMILES string.

    Args:
        smiles: SMILES string. Missing values (``None``, ``NaN`` or any other
            non-string) and blank strings are treated as invalid input.

    Returns:
        The bit vector produced by ``AllChem.GetMorganFingerprintAsBitVect``,
        or ``None`` when the input is not a usable string or RDKit cannot
        parse it into a molecule.
    """
    # Missing cells read by pandas arrive as float NaN, not as strings. Hand
    # those (and blank strings) back as None so the caller's `.notna()` filter
    # drops them rather than passing a non-SMILES value on to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)

# Cut-offs for the applicability-domain filter, named here so they are easy to tune
MIN_PROB_ACTIVE = 0.8       # predicted probability of activity must exceed this
MIN_MAX_SIMILARITY = 0.4    # max Tanimoto similarity to a known active must exceed this

train_actives['fp'] = train_actives['Smiles'].apply(compute_fingerprint)
df_fda['fp'] = df_fda['Smiles'].apply(compute_fingerprint)

# Drop invalid SMILES rows
train_actives = train_actives[train_actives['fp'].notna()]
df_fda = df_fda[df_fda['fp'].notna()]

# Reference fingerprints, extracted once for the bulk similarity calls below
train_active_fps = train_actives['fp'].tolist()

# Initialize list to collect results
ad_results = []

print("Calculating Applicability Domain similarity metrics...")

# For each FDA compound, calculate max and mean similarity to known actives
for name, smiles, predicted, prob, fda_fp in tqdm(
    zip(
        df_fda['Name'],
        df_fda['Smiles'],
        df_fda['predicted_activity'],
        df_fda['probability_active'],
        df_fda['fp'],
    ),
    total=len(df_fda),
):
    similarities = DataStructs.BulkTanimotoSimilarity(fda_fp, train_active_fps)
    has_reference = len(similarities) > 0

    ad_results.append({
        'Name': name,
        'Smiles': smiles,
        'Predicted_Activity': predicted,
        'Prob_Active': prob,
        # With no known actives to compare against, report NaN instead of
        # failing in max() or dividing by zero; NaN never passes the filter.
        'Max_Similarity': max(similarities) if has_reference else float('nan'),
        'Mean_Similarity': (
            sum(similarities) / len(similarities) if has_reference else float('nan')
        )
    })

# Convert to DataFrame; explicit columns keep the filter below valid even when
# no FDA compound produced a result
ad_df = pd.DataFrame(
    ad_results,
    columns=[
        'Name', 'Smiles', 'Predicted_Activity',
        'Prob_Active', 'Max_Similarity', 'Mean_Similarity',
    ],
)

# Filter by threshold: active prediction with acceptable similarity
filtered_hits = ad_df[
    (ad_df['Prob_Active'] > MIN_PROB_ACTIVE) &
    (ad_df['Max_Similarity'] > MIN_MAX_SIMILARITY)
].sort_values(by='Prob_Active', ascending=False)

# Save filtered candidates
filtered_hits.to_csv("fda_hits_applicability_filtered.csv", index=False)

# Display top 10
filtered_hits.head(10)




Calculating Applicability Domain similarity metrics...


100%|██████████| 2620/2620 [00:14<00:00, 184.89it/s]


Unnamed: 0,Name,Smiles,Predicted_Activity,Prob_Active,Max_Similarity,Mean_Similarity


In [None]:
# Checking similarity among all train data and all FDA drugs to find out if my base train dataset is similar to any drug or not!

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import pandas as pd
from tqdm import tqdm

# Read the combined table; per the split below, its 'dataset' column
# distinguishes 'train' / 'test' rows from 'FDA' rows
df_all = pd.read_csv("Dataset with selected features.csv")

# Train and test rows form the reference set; FDA rows are the query set
df_train = df_all.loc[df_all['dataset'].isin(['train', 'test'])].copy()
df_fda = df_all.loc[df_all['dataset'].eq('FDA')].copy()

# Convert SMILES to RDKit Mol objects and compute ECFP4 fingerprints (radius=2)
def get_ecfp4_fingerprint(smiles):
    """Return the Morgan (radius 2, 2048-bit) fingerprint for a SMILES string.

    Args:
        smiles: SMILES string. Missing values (``None``, ``NaN`` or any other
            non-string) and blank strings are treated as invalid input.

    Returns:
        The bit vector produced by ``AllChem.GetMorganFingerprintAsBitVect``,
        or ``None`` when the input is not a usable string or RDKit cannot
        parse it into a molecule.
    """
    # Missing cells read by pandas arrive as float NaN, not as strings. Hand
    # those (and blank strings) back as None so the caller's `.notna()` filter
    # drops them rather than passing a non-SMILES value on to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)

# Fingerprint both sets; SMILES that cannot be fingerprinted come back as None
df_train['fp'] = df_train['Smiles'].apply(get_ecfp4_fingerprint)
df_fda['fp'] = df_fda['Smiles'].apply(get_ecfp4_fingerprint)

# Remove rows with invalid fingerprints
df_train = df_train[df_train['fp'].notna()]
df_fda = df_fda[df_fda['fp'].notna()]

# Pull the train/test columns out once, so the loop below does not have to
# rebuild a pandas row object for every FDA/train pair.
train_fps = df_train['fp'].tolist()
train_names = df_train['Name'].tolist()
train_smiles = df_train['Smiles'].tolist()

# Compute Tanimoto similarity between every FDA compound and every train/test compound
similarities_all = []

print("Calculating Tanimoto similarity between FDA compounds (from df_all) and train/test compounds (from df_all)...")

for fda_name, fda_smiles, fda_fp in tqdm(
    zip(df_fda['Name'], df_fda['Smiles'], df_fda['fp']),
    total=len(df_fda),
):
    # One bulk call scores this FDA compound against all train/test compounds,
    # in the same order as train_names / train_smiles.
    pair_sims = DataStructs.BulkTanimotoSimilarity(fda_fp, train_fps)
    for train_name, train_smi, sim in zip(train_names, train_smiles, pair_sims):
        similarities_all.append({
            'FDA_Name': fda_name,
            'FDA_SMILES': fda_smiles,
            'Train_Name': train_name,
            'Train_SMILES': train_smi,
            'Similarity': sim
        })

# Create a DataFrame from similarity results. The columns are named explicitly
# so that the frame still has a 'Similarity' column when there are no pairs.
similarity_all_df = pd.DataFrame(
    similarities_all,
    columns=['FDA_Name', 'FDA_SMILES', 'Train_Name', 'Train_SMILES', 'Similarity'],
)

# Optional: Save to CSV
similarity_all_df.to_csv("FDA_vs_TrainTest_similarity_from_df_all.csv", index=False)

# Show top 10 most similar pairs
display(similarity_all_df.sort_values(by='Similarity', ascending=False).head(10))



Calculating Tanimoto similarity between FDA compounds (from df_all) and train/test compounds (from df_all)...


100%|██████████| 2628/2628 [05:16<00:00,  8.31it/s]


Unnamed: 0,FDA_Name,FDA_SMILES,Train_Name,Train_SMILES,Similarity
363861,MILTEFOSINE,CCCCCCCCCCCCCCCCOP(=O)([O-])OCC[N+](C)(C)C,CHEMBL1481114,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)COP(=O)([O...,0.541667
1027732,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL597335,CC(Oc1ccccc1)C(=O)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.516667
2165297,COLFOSCERIL PALMITATE,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,CHEMBL1481114,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)COP(=O)([O...,0.491803
1027483,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL598141,O=C(CSc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.474576
1026900,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL596917,O=C(CNc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.474576
1026938,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL598254,O=C(COc1ccc(Cl)cc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.474576
1026962,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL598765,O=C(NCc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.466667
1026877,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL611987,O=C(CCc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.466667
1459676,DOPAMINE HYDROCHLORIDE,Cl.NCCc1ccc(O)c(O)c1,CHEMBL291507,Oc1ccc(Cc2ccccc2)cc1O,0.466667
896904,MASOPROCOL,C[C@H](Cc1ccc(O)c(O)c1)[C@@H](C)Cc1ccc(O)c(O)c1,CHEMBL291507,Oc1ccc(Cc2ccccc2)cc1O,0.466667


In [None]:
# Show the 20 highest-similarity FDA vs train/test pairs
display(
    similarity_all_df
    .sort_values(by='Similarity', ascending=False)
    .iloc[:20]
)

Unnamed: 0,FDA_Name,FDA_SMILES,Train_Name,Train_SMILES,Similarity
363861,MILTEFOSINE,CCCCCCCCCCCCCCCCOP(=O)([O-])OCC[N+](C)(C)C,CHEMBL1481114,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)COP(=O)([O...,0.541667
1027732,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL597335,CC(Oc1ccccc1)C(=O)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.516667
2165297,COLFOSCERIL PALMITATE,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N...,CHEMBL1481114,CCCCCCCCCCCCC/C=C/[C@@H](O)[C@@H](N)COP(=O)([O...,0.491803
1027483,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL598141,O=C(CSc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.474576
1026900,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL596917,O=C(CNc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.474576
1026938,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL598254,O=C(COc1ccc(Cl)cc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.474576
1026962,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL598765,O=C(NCc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.466667
1026877,BENOXAPROFEN,CC(C(=O)O)c1ccc2oc(-c3ccc(Cl)cc3)nc2c1,CHEMBL611987,O=C(CCc1ccccc1)Nc1ccc(-c2nc3cc(Cl)ccc3o2)cc1,0.466667
1459676,DOPAMINE HYDROCHLORIDE,Cl.NCCc1ccc(O)c(O)c1,CHEMBL291507,Oc1ccc(Cc2ccccc2)cc1O,0.466667
896904,MASOPROCOL,C[C@H](Cc1ccc(O)c(O)c1)[C@@H](C)Cc1ccc(O)c(O)c1,CHEMBL291507,Oc1ccc(Cc2ccccc2)cc1O,0.466667
