In [None]:
!pip install rdkit-pypi
!pip install rdkit-pypi --upgrade

!pip install "numpy<2.0"
!pip install Pillow

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import MACCSkeys
import csv
from tqdm import tqdm

In [1]:
def calculate_similarity_maccs(smiles1, smiles2):
    """Calculate Tanimoto similarity between two SMILES strings using MACCS keys"""
    try:
        mol1 = Chem.MolFromSmiles(smiles1)
        mol2 = Chem.MolFromSmiles(smiles2)
        if mol1 is None or mol2 is None:
            return 0.0
        fp1 = MACCSkeys.GenMACCSKeys(mol1)
        fp2 = MACCSkeys.GenMACCSKeys(mol2)
        return DataStructs.TanimotoSimilarity(fp1, fp2)
    except:
        return 0.0


df_smiles = pd.read_csv('generated_molecules.csv')
generated_smiles = df_smiles['SMILES'].tolist()


df_original = pd.read_csv('enumerated_smiles.csv')
original_smiles = df_original['Enumerated_SMILES'].tolist()

# Calculate similarities for each generated molecule
results = []
for gen_smile in tqdm(generated_smiles, desc="Calculating similarities"):
    mol = Chem.MolFromSmiles(gen_smile)
    max_similarity = 0.0
    most_similar_smile = ""

    # Find the maximum similarity with any molecule in the original database
    for orig_smile in original_smiles:
        sim = calculate_similarity_maccs(gen_smile, orig_smile)
        if sim > max_similarity:
            max_similarity = sim
            most_similar_smile = orig_smile

    results.append({
        'SMILES': gen_smile,
        'mol': mol,
        'ts_index': max_similarity,
        'most_similar_original': most_similar_smile
    })


df_results = pd.DataFrame(results)
df_results.to_csv('tanimoto_similarities.csv', index=False)

# Print summary statistics
print("\nSummary Statistics:")
print(f"Average Tanimoto similarity: {df_results['ts_index'].mean():.3f}")
print(f"Maximum Tanimoto similarity: {df_results['ts_index'].max():.3f}")
print(f"Minimum Tanimoto similarity: {df_results['ts_index'].min():.3f}")

# Print top 5 most similar pairs
print("\nTop 5 most similar pairs:")
top_5 = df_results.nlargest(5, 'ts_index')
for _, row in top_5.iterrows():
    print(f"Generated: {row['SMILES']}")
    print(f"Original:  {row['most_similar_original']}")
    print(f"Similarity: {row['ts_index']:.3f}\n")

print("\nResults have been saved to 'tanimoto_similarities.csv'")



Calculating similarities: 100%|██████████| 101/101 [50:43<00:00, 30.14s/it]


Summary Statistics:
Average Tanimoto similarity: 0.610
Maximum Tanimoto similarity: 0.842
Minimum Tanimoto similarity: 0.431

Top 5 most similar pairs:
Generated: Cc1nc2c(c(=O)n1C1CCNC1)C=C2
Original:  Cc1nc2ccccc2c(=O)n1C1CCNCC1
Similarity: 0.842

Generated: CC1=Cc2nc(C)n(C3CCNC3)c(=O)c21
Original:  Cc1ccc2nc(C)n(C3CCNCC3)c(=O)c2c1
Similarity: 0.817

Generated: CC1CCN(C)c2cccn2-c2nc1cs2
Original:  Cc1cccc2nc(N3CCCN(C)CC3)sc12
Similarity: 0.776

Generated: CCC1CCC(C)CN(CC)C1
Original:  Cc1cccc(C2CCCCN(C)C2)c1
Similarity: 0.742

Generated: CN1C=CCN2CCC(CCC=CC1=O)C2
Original:  CN1CCC(N2CCCC2=O)CC1
Similarity: 0.739


Results have been saved to 'tanimoto_similarities.csv'



