In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances

In [None]:
# Function to calculate the Tanimoto similarity matrix
def calculate_tanimoto_similarity(smiles_list):
    """Return the pairwise Tanimoto similarity matrix for a list of SMILES.

    Each SMILES string is parsed with RDKit and encoded as a Morgan
    fingerprint bit vector (radius 2, 2048 bits). Similarities are computed
    with ``DataStructs.FingerprintSimilarity`` for every unordered pair.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n, n)`` float matrix. The diagonal is 1.0 (a molecule is
        identical to itself); an empty input yields a ``(0, 0)`` matrix.

    Raises
    ------
    ValueError
        If any entry cannot be parsed into a molecule.
    """
    fingerprints = []
    for index, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None rather than
        # raising; fail here with a message that names the offending entry.
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES at index {index}: {smiles!r}"
            )
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        )

    num_fps = len(fingerprints)
    # Start from the identity so self-similarity is 1.0; only the strict upper
    # triangle is computed and then mirrored.
    similarity_matrix = np.eye(num_fps)
    for i in range(num_fps):
        for j in range(i + 1, num_fps):
            similarity = DataStructs.FingerprintSimilarity(
                fingerprints[i], fingerprints[j]
            )
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity
    return similarity_matrix


# Function to perform hierarchical clustering and get the top N similar molecules
def hierarchical_clustering(smiles_list, top_n=9, n_clusters=None, method="ward"):
    """Cluster molecules by Tanimoto distance and keep the largest clusters.

    The Tanimoto similarity matrix is converted to a distance matrix
    (``1 - similarity``), condensed, and passed to SciPy's agglomerative
    ``linkage``. The tree is cut into at most ``n_clusters`` flat clusters and
    the molecules belonging to the ``top_n`` most populated clusters are
    returned.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, one per molecule (at least two).
    top_n : int, default 9
        Number of largest clusters whose members are returned.
    n_clusters : int or None, default None
        Maximum number of flat clusters to cut the tree into. ``None`` means
        ``top_n``; in that case every cluster is among the "top" ones and all
        molecules are returned. Pass a value larger than ``top_n`` to make the
        selection discard the smaller clusters.
    method : str, default "ward"
        Linkage method forwarded to ``scipy.cluster.hierarchy.linkage``.
        SciPy documents "ward", "centroid" and "median" as correctly defined
        only for Euclidean distances; consider "average" or "complete" if that
        matters for your analysis.

    Returns
    -------
    top_n_smiles : pandas.DataFrame
        Columns ``smiles`` and ``cluster`` for the retained molecules (the
        original row index is preserved).
    Z : numpy.ndarray
        The linkage matrix, usable with ``dendrogram``.

    Raises
    ------
    ValueError
        If fewer than two molecules are given, or ``top_n`` / ``n_clusters``
        is smaller than 1.
    """
    smiles_list = list(smiles_list)
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")
    if n_clusters is None:
        n_clusters = top_n
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be at least 1, got {n_clusters}")
    if len(smiles_list) < 2:
        raise ValueError(
            "hierarchical clustering needs at least 2 molecules, "
            f"got {len(smiles_list)}"
        )

    similarity_matrix = np.asarray(
        calculate_tanimoto_similarity(smiles_list), dtype=float
    )
    distance_matrix = 1.0 - similarity_matrix
    # A molecule is at distance 0 from itself regardless of what the
    # similarity matrix stores on its diagonal.
    np.fill_diagonal(distance_matrix, 0.0)

    # linkage() interprets a 2-D array as observation vectors, so the
    # precomputed distances must be handed over in condensed (1-D) form.
    Z = linkage(squareform(distance_matrix), method=method)
    clusters = fcluster(Z, t=n_clusters, criterion="maxclust")

    # Keep the members of the top_n most populated clusters
    clustered_smiles = pd.DataFrame({"smiles": smiles_list, "cluster": clusters})
    top_n_clusters = (
        clustered_smiles["cluster"].value_counts().nlargest(top_n).index.tolist()
    )
    top_n_smiles = clustered_smiles[clustered_smiles["cluster"].isin(top_n_clusters)]

    return top_n_smiles, Z

In [None]:
# Example usage. The molecules below are a small illustrative set so that this
# cell runs end-to-end on a fresh kernel; replace them with your own list of
# SMILES strings (e.g. the 100 molecules you want to cluster).
smiles_list = [
    "CCO",  # ethanol
    "CCCO",  # 1-propanol
    "CC(C)O",  # 2-propanol
    "CC(=O)O",  # acetic acid
    "CCC(=O)O",  # propionic acid
    "c1ccccc1",  # benzene
    "Cc1ccccc1",  # toluene
    "Oc1ccccc1",  # phenol
    "c1ccncc1",  # pyridine
    "OC(=O)c1ccccc1O",  # salicylic acid
    "CC(=O)Oc1ccccc1C(=O)O",  # aspirin
    "CC(=O)Nc1ccc(O)cc1",  # paracetamol
    "CC(C)Cc1ccc(cc1)C(C)C(=O)O",  # ibuprofen
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # caffeine
]
top_n_smiles, Z = hierarchical_clustering(smiles_list, top_n=9)

# Plot dendrogram
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(Z, labels=smiles_list, leaf_rotation=90, ax=ax)
ax.set_title("Hierarchical clustering of molecules")
ax.set_xlabel("SMILES")
ax.set_ylabel("Linkage distance")
fig.tight_layout()
plt.show()

# Display the molecules in the top 9 clusters (rich DataFrame rendering)
top_n_smiles