In [23]:
!pip install rdkit
!pip install rdkit-pypi

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Colab-only: mount Google Drive at /content/drive so files stored under
# MyDrive can be read by path in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Rank every compound in the library by Tanimoto similarity to levodopa,
# using Morgan (circular) bit-vector fingerprints, and print the top hits.

# Fingerprint settings shared by the library and the query molecule; both
# must use identical parameters for the similarity to be meaningful.
MORGAN_RADIUS = 2
MORGAN_NBITS = 2048
TOP_N = 3

# --- Load the compound library ---
# The CSV is semicolon-delimited; strip header names so a stray space around
# "smiles" does not break the column lookup below.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/compounds.csv", delimiter=";")
df.columns = df.columns.str.strip()

df = df.dropna(subset=["smiles"])
smiles_list = df["smiles"].tolist()

# --- Parse SMILES and build fingerprints ---
# Chem.MolFromSmiles returns None for SMILES it cannot parse, and the
# fingerprint call raises when handed None. Emit a None fingerprint for those
# rows instead, so one bad row does not abort the cell and `fingerprints`
# stays index-aligned with `smiles_list`.
molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(mol, MORGAN_RADIUS, nBits=MORGAN_NBITS)
    if mol is not None
    else None
    for mol in molecules
]

# Surface skipped rows rather than dropping them silently.
invalid_smiles = [smi for smi, mol in zip(smiles_list, molecules) if mol is None]
if invalid_smiles:
    print(f"Skipped {len(invalid_smiles)} unparsable SMILES (first few: {invalid_smiles[:5]})")

# --- Reference molecule ---
levodopa_smiles = "C1=CC(=C(C=C1CC(C(=O)O)N)O)O"
levodopa_mol = Chem.MolFromSmiles(levodopa_smiles)
levodopa_fp = AllChem.GetMorganFingerprintAsBitVect(levodopa_mol, MORGAN_RADIUS, nBits=MORGAN_NBITS)

# --- Score, rank, report ---
tanimoto_scores = []
for i, fp in enumerate(fingerprints):
    if fp is not None:  # skip rows whose SMILES failed to parse
        similarity = DataStructs.TanimotoSimilarity(levodopa_fp, fp)
        tanimoto_scores.append((smiles_list[i], similarity))

# list.sort is stable, so compounds with equal scores keep their CSV order.
tanimoto_scores.sort(key=lambda x: x[1], reverse=True)
top_hits = tanimoto_scores[:TOP_N]

print(f"Top {TOP_N} most similar compounds:")
for smiles, score in top_hits:
    print(f"SMILES: {smiles}, Similarity: {score:.4f}")

Top 3 most similar compounds:
SMILES: C1=CC(=C(C=C1CC(C(=O)O)N)O)O.[Na], Similarity: 0.9643
SMILES: COC1=C(C=C(C=C1)CC(C(=O)O)N)O, Similarity: 0.6667
SMILES: COC1=C(C=CC(=C1)CC(C(=O)O)N)O, Similarity: 0.6667
