# Today I have been looking at some basic clustering of molecules.

## Todo:
1. See if we can cluster by things other than fingerprints
2. Make the pandas data handling a bit nicer
3. Consider what the topology is actually meaning here



In [3]:
import numpy as np
import sklearn
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
import rdkit.Chem.Fingerprints.ClusterMols
from IPython.display import SVG, IFrame
import gzip
import os
import pickle
import pandas as pd
import kmapper as km
from kmapper import jupyter
from sklearn import cluster

In [4]:
with open("../data/processed/curated_set_with_publication_year.pd.pkl", "rb") as infile:
    df = pickle.load(infile)

In [8]:
from collections import Counter
possible_targets = Counter([item for item in df["TGT_CHEMBL_ID"]])
print(len(possible_targets))
print(len(df))
print(possible_targets)
first_target = df["TGT_CHEMBL_ID"] == "CHEMBL4336"
sub_df = df[first_target]

1227
314767
Counter({'CHEMBL240': 4703, 'CHEMBL253': 3472, 'CHEMBL218': 2997, 'CHEMBL251': 2976, 'CHEMBL228': 2853, 'CHEMBL264': 2548, 'CHEMBL226': 2544, 'CHEMBL217': 2473, 'CHEMBL344': 2358, 'CHEMBL243': 2315, 'CHEMBL256': 2304, 'CHEMBL205': 2257, 'CHEMBL279': 2142, 'CHEMBL261': 2089, 'CHEMBL4235': 2020, 'CHEMBL244': 2010, 'CHEMBL222': 2003, 'CHEMBL233': 1998, 'CHEMBL4078': 1994, 'CHEMBL284': 1950, 'CHEMBL237': 1908, 'CHEMBL259': 1828, 'CHEMBL4822': 1799, 'CHEMBL3371': 1773, 'CHEMBL214': 1703, 'CHEMBL313': 1690, 'CHEMBL3594': 1678, 'CHEMBL203': 1659, 'CHEMBL224': 1643, 'CHEMBL4296': 1594, 'CHEMBL260': 1589, 'CHEMBL235': 1575, 'CHEMBL234': 1569, 'CHEMBL225': 1565, 'CHEMBL236': 1550, 'CHEMBL220': 1542, 'CHEMBL238': 1518, 'CHEMBL247': 1474, 'CHEMBL255': 1445, 'CHEMBL3952': 1424, 'CHEMBL2039': 1403, 'CHEMBL340': 1386, 'CHEMBL3242': 1380, 'CHEMBL204': 1347, 'CHEMBL5071': 1332, 'CHEMBL239': 1324, 'CHEMBL325': 1298, 'CHEMBL5763': 1282, 'CHEMBL2034': 1258, 'CHEMBL4015': 1234, 'CHEMBL2409': 12

In [48]:
fingerprint_data = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles),3) for smiles in sub_df["SMILES"]]
try:
    sub_df.insert(0, "FINGERPRINT",fingerprint_data)
except ValueError:
    sub_df.loc["FINGERPRINT"] = fingerprint_data

In [49]:
sub_df

Unnamed: 0,FINGERPRINT,BIOACT_PCHEMBL_VALUE,CMP_ACD_LOGD,CMP_ACD_LOGP,CMP_ALOGP,CMP_AROMATIC_RINGS,CMP_CHEMBL_ID,CMP_FULL_MWT,CMP_HBA,CMP_HBD,...,CMP_STANDARD_INCHI_KEY,CMP_STRUCTURE_TYPE,CMP_TYPE_PROTEIN,CMP_TYPE_SMALL_MOLECULE,DOC_YEAR,SMILES,TC_key,TGT_CHEMBL_ID,TGT_ORGANISM,TGT_TID
CHEMBL4336 - CHEMBL439934,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",7.030,-0.55,2.06,3.39,1,CHEMBL439934,420.51,5,3,...,IGRHMJRVBYYCFZ-NGTZFMKBSA-N,MOL,False,True,2001,COCc1cccc(C[C@H](O)/C=C/[C@H]2[C@H](O)C[C@@H](...,CHEMBL4336 - CHEMBL439934,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL1095767,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9.060,2.04,4.86,6.77,3,CHEMBL1095767,498.61,5,2,...,RTNUVYOOKJGYIN-GDLZYMKVSA-N,MOL,False,True,2010,Cc1cc(C)cc([C@@H](CC(C)C)NC(=O)c2cc(COc3cccc(C...,CHEMBL4336 - CHEMBL1095767,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL1094161,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9.485,2.37,5.20,7.30,3,CHEMBL1094161,509.58,4,2,...,WJWSOMYLMUTCOD-HHHXNRCGSA-N,MOL,False,True,2010,Cc1cc(C)cc([C@@H](CC(C)C)NC(=O)c2cc(COc3cc(F)c...,CHEMBL4336 - CHEMBL1094161,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL595159,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",6.460,0.68,4.42,5.08,4,CHEMBL595159,396.44,4,1,...,CUWDWCVTMUTFJT-UHFFFAOYSA-N,MOL,False,True,2009,O=C(O)C#Cc1ccc(Cn2cccn2)cc1OCCc1ccc2ccccc2c1,CHEMBL4336 - CHEMBL595159,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL592056,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.960,-0.72,2.09,4.31,3,CHEMBL592056,389.44,4,2,...,LCYAQTLRAKBEIB-UHFFFAOYSA-N,MOL,False,True,2010,O=C(O)CCc1ccc(COc2ccccc2)cc1C(=O)NCc1ccccc1,CHEMBL4336 - CHEMBL592056,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL593764,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",5.800,2.55,5.43,5.59,2,CHEMBL593764,396.48,5,2,...,FFXXDYJVLLGASF-MDZDMXLPSA-N,MOL,False,True,2009,Cc1c(C)c2c(c(C)c1O)CCC(C)(CCOc1ccccc1/C=C/C(=O...,CHEMBL4336 - CHEMBL593764,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL599810,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9.050,5.41,5.41,7.56,3,CHEMBL599810,508.05,4,2,...,VSONIJBYVAGXHF-UHFFFAOYSA-N,MOL,False,True,2010,Cc1cc(C)cc(C(CC(C)C)NC(=O)c2cc(COc3cccc(Cl)c3)...,CHEMBL4336 - CHEMBL599810,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL598174,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.960,2.06,4.75,6.00,3,CHEMBL598174,488.55,5,2,...,QJALELWCHCKRGI-UHFFFAOYSA-N,MOL,False,True,2010,CC(C)CC(NC(=O)c1cc(COc2cccc(C#N)c2)ccc1CCC(=O)...,CHEMBL4336 - CHEMBL598174,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL603633,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",6.090,-1.04,2.58,4.33,4,CHEMBL603633,402.44,5,1,...,JUZWEMPFFFBUCL-UHFFFAOYSA-N,MOL,False,True,2009,O=C(O)COc1ccc(Cn2cccn2)cc1OCCc1ccc2ccccc2c1,CHEMBL4336 - CHEMBL603633,CHEMBL4336,Mus musculus,12507
CHEMBL4336 - CHEMBL598995,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.245,1.44,4.25,6.40,3,CHEMBL598995,459.58,4,2,...,CDBREVYLTWFNCI-UHFFFAOYSA-N,MOL,False,True,2010,Cc1ccc(C(CC(C)C)NC(=O)c2cc(COc3ccccc3)ccc2CCC(...,CHEMBL4336 - CHEMBL598995,CHEMBL4336,Mus musculus,12507


In [50]:
fingerprint_data = []
for index, series in sub_df.iterrows():
    fingerprint_data.append((series["CMP_CHEMBL_ID"], series["FINGERPRINT"]))
len(fingerprint_data)


199

In [51]:
def GetDistanceMatrix(data,metric,isSimilarity=1):
    """
    Adapted from rdkit, because their implementation has a bug
    in it (it relies on Python 2 doing integer division by default).
    It is also poorly documented. Metric is a function
    that returns the 'distance' between points 1 and 2.
    
    Data should be a list of tuples with fingerprints in position 1
    (the rest of the elements of the tuple are not important)

    Returns the symmetric distance matrix.
    (see ML.Cluster.Resemblance for layout documentation)
    """
    nPts = len(data)
    num_pairs = int(nPts*(nPts-1)/2)
    res = np.zeros(num_pairs ,np.float)
    print(res)
    nSoFar=0
    for col in range(1,nPts):
        for row in range(col):
            fp1 = data[col][1]
            fp2 = data[row][1]
            if fp1.GetNumBits()>fp2.GetNumBits():
                fp1 = DataStructs.FoldFingerprint(fp1,fp1.GetNumBits()/fp2.GetNumBits())
            elif fp2.GetNumBits()>fp1.GetNumBits():
                fp2 = DataStructs.FoldFingerprint(fp2,fp2.GetNumBits()/fp1.GetNumBits())
            sim = metric(fp1,fp2)
            if isSimilarity:
                sim = 1.-sim
            res[nSoFar] = sim
            nSoFar += 1
    return res   

In [52]:
distance_matrix = GetDistanceMatrix(fingerprint_data, metric=rdkit.DataStructs.TanimotoSimilarity)
distance_matrix

[0. 0. 0. ... 0. 0. 0.]


array([0.85810811, 0.85517241, 0.40186916, ..., 0.26732673, 0.36633663,
       0.26530612])

Now we need to mangle this flat distance matrix into a sane square one.
The indices of $(\text{row}, \text{col})$ are at $\frac{(\text{col}\times(\text{col}-1))}{2} + \text{row} $
in the flat matrix.

In [53]:
sq_distance_matrix = np.empty([len(fingerprint_data), len(fingerprint_data)])
for row in range(len(fingerprint_data)):
    for col in range(row + 1):
        index = int((col * (col - 1)) / 2) + row
        if row == col:
            sq_distance_matrix[row, col] = 0.0
        else:
            sq_distance_matrix[row, col] = distance_matrix[index]
            sq_distance_matrix[col, row] = distance_matrix[index]

In [73]:
numerical_cols = [sub_df.columns[pos] for pos, item in enumerate(sub_df.dtypes) if item in [np.float64, np.int64]]
new_data = sub_df[numerical_cols].to_numpy()
dimensional_data = np.array([row[0] for row in new_data])
print(dimensional_data)
mapper = km.KeplerMapper(verbose=1)
graph = mapper.map(dimensional_data, X=sq_distance_matrix, precomputed=True, cover=km.Cover(n_cubes=35, perc_overlap=0.2), clusterer=sklearn.cluster.DBSCAN(algorithm='auto', eps=0.40, leaf_size=30, metric='precomputed', min_samples=3, n_jobs=4))

[7.03  9.06  9.485 6.46  6.96  5.8   9.05  8.96  6.09  8.245 9.13  8.215
 6.92  7.04  8.7   8.285 8.55  8.6   8.465 8.455 8.96  7.835 6.35  8.745
 6.415 6.035 8.945 5.54  6.875 7.41  7.53  7.25  7.07  9.27  8.075 8.21
 7.855 8.125 8.835 7.81  6.28  8.3   9.125 8.8   9.085 8.485 5.82  8.435
 8.365 8.86  8.785 8.365 7.66  8.42  6.68  9.075 8.365 6.11  6.775 8.805
 9.01  9.09  9.17  7.87  7.55  7.55  8.73  9.17  6.85  7.66  8.635 8.515
 8.3   7.59  8.72  8.745 9.165 6.645 6.85  8.785 8.94  8.54  6.06  8.61
 9.125 8.4   8.685 8.875 8.92  8.35  8.7   8.97  8.615 8.62  7.96  8.585
 8.305 7.82  8.335 6.87  5.945 9.065 8.57  8.16  8.4   6.955 9.495 8.965
 9.37  5.77  7.91  8.15  9.17  8.81  8.43  9.04  9.    8.795 5.66  8.935
 6.71  7.28  6.82  8.535 9.03  8.825 8.77  9.19  8.655 8.06  8.855 8.595
 7.25  6.13  8.73  9.6   9.045 8.985 8.46  7.48  6.98  7.765 9.22  6.475
 8.41  8.88  8.43  8.855 7.545 6.6   8.475 7.76  5.49  5.44  5.55  5.77
 5.55  5.77  5.6   6.68  6.15  5.72  5.92  7.41  6.01 

In [74]:
# Visualize it
mapper.visualize(graph, path_html="map-dataframe-test.html",
                 title="Map Dataframe Test", color_function=dimensional_data)
IFrame("map-dataframe-test.html", 800, 600)

Wrote visualization to: map-dataframe-test.html


How do we actually extract meaningful data from this list? Time to visualise it!

In [56]:
mols = [Chem.MolFromSmiles(sub_df.iloc[i]["SMILES"]) for i in graph["nodes"]["cube2_cluster0"]]
from rdkit.Chem import rdFMCS
res =rdFMCS.FindMCS(mols)
newmol = Chem.MolFromSmarts(res.smartsString)

In [57]:
def draw_molecule(molec, molsize, highlight_atoms=None):
    rdDepictor.Compute2DCoords(molec)
    drawer = rdMolDraw2D.MolDraw2DSVG(molsize[0], molsize[1], highlight_atoms=highlight_atoms)
    drawer.DrawMolecule(molec)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    display(SVG(svg.replace("svg:", "")))

In [None]:
for index, node in enumerate(graph["nodes"]):
    mols = [Chem.MolFromSmiles(sub_df.iloc[i]["SMILES"]) for i in graph["nodes"][node]]
    mean_bioactivity = np.mean([sub_df.iloc[i]["BIOACT_PCHEMBL_VALUE"] for i in graph["nodes"][node]])
    if len(mols) > 1:
        max_substructure = rdFMCS.FindMCS(mols, ringMatchesRingOnly=True).smartsString
        mol_smarts = Chem.MolFromSmarts(max_substructure)
        highlight_list = [mol.GetSubstructMatches(mol_smarts)[0] for mol in mols]
        print(node, mean_bioactivity)
        display(SVG(Chem.Draw._MolsToGridSVG(mols, highlightAtomLists=highlight_list)))

In [59]:
print(graph)

{'nodes': defaultdict(<class 'list'>, {'cube0_cluster0': [27, 118, 156, 169, 171, 178, 184, 186], 'cube1_cluster0': [5, 27, 46, 100, 109, 118, 152, 153, 155, 156, 157, 158, 161, 162, 167, 168, 169, 171, 174, 176, 177, 181, 183, 184, 186, 194], 'cube2_cluster0': [5, 8, 25, 40, 46, 57, 82, 100, 109, 118, 133, 155, 157, 158, 160, 161, 162, 164, 166, 167, 168, 169, 172, 174, 176, 177, 179, 181, 183, 184, 185, 191, 193, 194], 'cube3_cluster0': [3, 8, 22, 24, 25, 40, 57, 82, 100, 133, 143, 160, 162, 164, 166, 179, 181, 182, 187, 189, 191, 193], 'cube4_cluster0': [3, 22, 24, 28, 40, 54, 58, 68, 77, 78, 99, 120, 122, 143, 149, 159, 160, 166, 180, 182, 187, 189, 191, 193, 196, 198], 'cube5_cluster0': [0, 3, 4, 12, 13, 28, 32, 54, 58, 68, 77, 78, 99, 105, 120, 122, 140, 143, 149, 159, 165, 180, 182, 189, 196, 197, 198], 'cube6_cluster0': [0, 4, 12, 13, 28, 29, 31, 32, 58, 68, 78, 99, 105, 121, 122, 132, 140, 163, 165, 173, 188, 195, 197], 'cube7_cluster0': [0, 13, 29, 30, 31, 32, 52, 64, 65, 69,

The theory goes that these clusters are linked by specific molecules. If they are linked, perhaps we should
look at the maximum common substructure of the linking atom within each cluster.

In [None]:
cm = [(1,0,0), (0, 1, 0), (0, 0, 1), (1, 1, 0), (0, 1, 1), (1, 0, 1)]
for node in graph["nodes"]:
    for othernode in graph["nodes"]:
        if node == othernode:
            break
        
        intersection = set(graph["nodes"][node]).intersection(graph["nodes"][node])
        print(node, "and", othernode," are linked by", intersection)
        for shared_mol in intersection:
            cmap_index = 0
            shared_mol_smiles = Chem.MolFromSmiles(sub_df.iloc[shared_mol]["SMILES"])
            mols = [Chem.MolFromSmiles(sub_df.iloc[i]["SMILES"]) for i in graph["nodes"][node]]
            highlight_list = []
            total_shared = set()
            for mol in graph["nodes"][node]:
                mol_smiles = Chem.MolFromSmiles(sub_df.iloc[mol]["SMILES"])
                max_substructure = rdFMCS.FindMCS([mol_smiles, shared_mol_smiles], ringMatchesRingOnly=True).smartsString
                mol_smarts = Chem.MolFromSmarts(max_substructure)
                matching_atoms = mol_smiles.GetSubstructMatches(mol_smarts)[0]
                highlight_list.append(matching_atoms)
                if not total_shared:
                    total_shared = set(matching_atoms)
                else:
                    total_shared = total_shared.intersection(set(matching_atoms))
                cmap_index += 1
            print(total_shared)
            print(len(highlight_list), len(mols))
            display(SVG(Chem.Draw._MolsToGridSVG(mols, highlightAtomLists=highlight_list)))
            break