In [59]:
import numpy as np
import sklearn
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, IFrame
import gzip
import os
import pickle
import pandas as pd
import kmapper as km
from kmapper import jupyter
from sklearn import cluster

In [60]:
with open("../data/processed/curated_set_with_publication_year.pd.pkl", "rb") as infile:
    df = pickle.load(infile)

In [61]:
first_target = df["TGT_CHEMBL_ID"] == "CHEMBL209"
sub_df = df[first_target]

In [62]:
fingerprint_data = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles),2) for smiles in sub_df["SMILES"]]
try:
    sub_df.insert(0, "FINGERPRINT",fingerprint_data)
except ValueError:
    sub_df.loc["FINGERPRINT"] = fingerprint_data

In [63]:
sub_df

Unnamed: 0,FINGERPRINT,BIOACT_PCHEMBL_VALUE,CMP_ACD_LOGD,CMP_ACD_LOGP,CMP_ALOGP,CMP_AROMATIC_RINGS,CMP_CHEMBL_ID,CMP_FULL_MWT,CMP_HBA,CMP_HBD,...,CMP_STANDARD_INCHI_KEY,CMP_STRUCTURE_TYPE,CMP_TYPE_PROTEIN,CMP_TYPE_SMALL_MOLECULE,DOC_YEAR,SMILES,TC_key,TGT_CHEMBL_ID,TGT_ORGANISM,TGT_TID
CHEMBL209 - CHEMBL19831,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.78,-2.86,0.58,-0.99,1,CHEMBL19831,568.58,11,5,...,OECPVFSZEWVKDZ-GOSISDBHSA-N,MOL,False,True,2002,COC(=O)c1ccc(F)cc1CS(=O)(=O)Nc1ccc(C)n(CC(=O)N...,CHEMBL209 - CHEMBL19831,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL332157,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.19,0.05,2.00,1.19,4,CHEMBL332157,462.48,7,4,...,RDSVRCGNSLPORJ-UHFFFAOYSA-N,MOL,False,True,2003,N=C(N)c1cccc(-n2nnnc2C(=O)Nc2ccc(-c3ccccc3S(N)...,CHEMBL209 - CHEMBL332157,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL52427,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.89,1.72,3.72,4.75,4,CHEMBL52427,341.41,2,4,...,ALBWLRSLXFJFTE-UHFFFAOYSA-N,MOL,False,True,2001,Cc1cc(-c2ccccc2)c(O)c(-c2cc3cc(C(=N)N)ccc3[nH]...,CHEMBL209 - CHEMBL52427,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL353213,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.17,1.74,1.94,0.50,3,CHEMBL353213,430.50,5,2,...,YTPXTRPVTXEMKM-UHFFFAOYSA-N,MOL,False,True,2003,Cc1cnc(NCCc2ccccc2)c(=O)n1CC(=O)NCc1ccc2nccn2c1C,CHEMBL209 - CHEMBL353213,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL100672,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.39,1.62,1.62,2.11,0,CHEMBL100672,166.22,2,0,...,WWRICQDSYZBXSH-HLTSFMKQSA-N,MOL,False,True,1998,C=CC[C@H]1C(=O)O[C@H]2CCC[C@@H]21,CHEMBL209 - CHEMBL100672,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL99622,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9.92,2.24,2.24,3.07,1,CHEMBL99622,234.31,3,0,...,LHVRINJTNXVOIY-SDDRHHMPSA-N,MOL,False,True,1998,O=C1O[C@H]2CCC[C@@H]2[C@H]1Sc1ccccc1,CHEMBL209 - CHEMBL99622,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL256892,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",8.09,-1.40,0.60,-0.29,2,CHEMBL256892,446.52,6,5,...,VXDAVYUFYPFGDX-SNPRPXQTSA-N,MOL,False,True,2004,CC(=O)N1C[C@H](O)C[C@H]1C(=O)N[C@@H](CCCNC(=N)...,CHEMBL209 - CHEMBL256892,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL101041,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9.30,1.15,1.15,2.41,0,CHEMBL101041,222.28,3,0,...,FRPFDONKAHIRLY-WOPDTQHZSA-N,MOL,False,True,1998,CC(C)=CC(=O)C[C@@H]1C(=O)O[C@H]2CCC[C@@H]21,CHEMBL209 - CHEMBL101041,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL168411,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.73,2.43,2.43,2.99,4,CHEMBL168411,619.71,6,3,...,AWCMDTMSPRXSSZ-RWPDHJIBSA-N,MOL,False,True,1999,Cn1c(C(=O)[C@H](Cc2cccc(C(=N)N)c2)NC(=O)[C@@H]...,CHEMBL209 - CHEMBL168411,CHEMBL209,Homo sapiens,42
CHEMBL209 - CHEMBL47207,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",6.77,-1.51,-0.51,3.50,3,CHEMBL47207,445.53,4,3,...,GJEPQTOOGGNJLG-UZUQRXQVSA-O,MOL,False,True,2002,COC(=O)[C@H](Cc1cccc(C(=N)N)c1)[C@@H](C)NC(=O)...,CHEMBL209 - CHEMBL47207,CHEMBL209,Homo sapiens,42


In [76]:
fingerprint_data = []
for index, series in sub_df.iterrows():
    fingerprint_data.append((series["CMP_CHEMBL_ID"], series["FINGERPRINT"]))
len(fingerprint_data)


80

In [70]:
def GetDistanceMatrix(data,metric,isSimilarity=1):
    """
    Adapted from rdkit, because their implementation has a bug
    in it (it relies on Python 2 doing integer division by default).
    It is also poorly documented. Metric is a function
    that returns the 'distance' between points 1 and 2.
    
    Data should be a list of tuples with fingerprints in position 1
    (the rest of the elements of the tuple are not important)

    Returns the symmetric distance matrix.
    (see ML.Cluster.Resemblance for layout documentation)
    """
    nPts = len(data)
    num_pairs = int(nPts*(nPts-1)/2)
    res = np.zeros(num_pairs ,np.float)
    print(res)
    nSoFar=0
    for col in range(1,nPts):
        for row in range(col):
            fp1 = data[col][1]
            fp2 = data[row][1]
            if fp1.GetNumBits()>fp2.GetNumBits():
                fp1 = DataStructs.FoldFingerprint(fp1,fp1.GetNumBits()/fp2.GetNumBits())
            elif fp2.GetNumBits()>fp1.GetNumBits():
                fp2 = DataStructs.FoldFingerprint(fp2,fp2.GetNumBits()/fp1.GetNumBits())
            sim = metric(fp1,fp2)
            if isSimilarity:
                sim = 1.-sim
            res[nSoFar] = sim
            nSoFar += 1
    return res   

In [89]:
import rdkit.Chem.Fingerprints.ClusterMols
distance_matrix = GetDistanceMatrix(fingerprint_data, metric=rdkit.DataStructs.DiceSimilarity)
distance_matrix

[0. 0. 0. ... 0. 0. 0.]


array([0.76377953, 0.75221239, 0.62      , ..., 0.70454545, 0.79439252,
       0.06382979])

In [94]:
# Now we need to mangle this flat distance matrix into a sane square one.
# The indices of (row, col) are at (col*(col-1))/2 + row
# in the flat matrix.

sq_distance_matrix = np.empty([len(fingerprint_data), len(fingerprint_data)])
for row in range(len(fingerprint_data)):
    for col in range(row + 1):
        index = int((col * (col - 1)) / 2) + row
        if row == col:
            sq_distance_matrix[row, col] = 0.0
        else:
            sq_distance_matrix[row, col] = distance_matrix[index]
            sq_distance_matrix[col, row] = distance_matrix[index]
pd.DataFrame(sq_distance_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,0.000000,0.752212,0.620000,0.676923,0.794872,0.766990,0.919192,0.953488,0.972222,0.887640,...,0.914894,0.833333,0.716535,0.860000,0.638889,0.579832,0.696970,0.557377,0.720930,0.603448
1,0.752212,0.000000,0.620000,0.676923,0.794872,0.766990,0.919192,0.953488,0.972222,0.887640,...,0.914894,0.833333,0.716535,0.860000,0.638889,0.579832,0.696970,0.557377,0.720930,0.603448
2,0.620000,0.620000,0.000000,0.794872,0.766990,0.919192,0.953488,0.972222,0.887640,0.881188,...,0.833333,0.716535,0.860000,0.638889,0.579832,0.696970,0.557377,0.720930,0.603448,0.627451
3,0.676923,0.676923,0.794872,0.000000,0.953488,0.972222,0.887640,0.881188,0.863636,0.837838,...,0.860000,0.638889,0.579832,0.696970,0.557377,0.720930,0.603448,0.627451,0.731092,0.931818
4,0.794872,0.794872,0.766990,0.953488,0.000000,0.863636,0.837838,0.802198,0.433333,0.696970,...,0.696970,0.557377,0.720930,0.603448,0.627451,0.731092,0.931818,0.888889,0.702479,0.872340
5,0.766990,0.766990,0.919192,0.972222,0.863636,0.000000,0.714286,0.752381,0.721311,0.890110,...,0.627451,0.731092,0.931818,0.888889,0.702479,0.872340,0.565217,0.185841,0.714286,0.206897
6,0.919192,0.919192,0.953488,0.887640,0.837838,0.714286,0.000000,0.913043,0.923077,0.831579,...,0.872340,0.565217,0.185841,0.714286,0.206897,0.580645,0.140741,0.721311,0.703704,0.632000
7,0.953488,0.953488,0.972222,0.881188,0.802198,0.752381,0.913043,0.000000,0.647059,0.688525,...,0.140741,0.721311,0.703704,0.632000,0.914894,0.833333,0.653543,0.860000,0.666667,0.697479
8,0.972222,0.972222,0.887640,0.863636,0.433333,0.721311,0.923077,0.647059,0.000000,0.585586,...,0.860000,0.666667,0.697479,0.666667,0.704918,0.707692,0.725806,0.862745,0.865169,0.840000
9,0.887640,0.887640,0.881188,0.837838,0.696970,0.890110,0.831579,0.688525,0.585586,0.000000,...,0.865169,0.840000,0.739130,0.311475,0.269841,0.808511,0.373134,0.747748,0.813953,0.717172


In [125]:
numerical_cols = [sub_df.columns[pos] for pos, item in enumerate(sub_df.dtypes) if item in [np.float64, np.int64]]
new_data = sub_df[numerical_cols].to_numpy()
data_shape = [80]
data_shape
dimensional_data = np.empty(data_shape)
for i, row in enumerate(new_data):
    dimensional_data[i] = row[0]
print(dimensional_data)
mapper = km.KeplerMapper(verbose=1)
graph = mapper.map(dimensional_data, X=sq_distance_matrix, precomputed=True, nr_cubes=4, clusterer=sklearn.cluster.DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='precomputed', metric_params=None, min_samples=3, n_jobs=None, p=None))

[ 7.78  8.19  6.89  5.17  7.39  9.92  8.09  9.3   4.73  6.77  7.85  7.44
  7.1   6.77  7.91  8.7   7.59  6.4   7.73  7.87  7.12  7.28  6.68  6.52
  8.36  7.5   4.    5.48  6.89  5.92 10.7   5.96  5.7   7.44  7.52  6.31
  7.92  6.66  5.48  5.47  4.52  7.71  5.19  5.7   5.07  7.68  6.23  6.8
  7.48  4.56  5.77  7.62  7.22  6.38  6.24  6.89  7.3   6.01  8.7   7.16
  6.15  7.16  7.92  6.89  5.1   5.22  6.43  6.54  6.    7.31  7.12  7.4
  9.22  7.29  5.7   7.15  5.05  7.4   7.36  8.08]
KeplerMapper(verbose=1)
Mapping on data shaped (80, 80) using lens shaped (80,)

Creating 4 hypercubes.

Created 3 edges and 4 nodes in 0:00:00.013224.


In [126]:
# Visualize it
mapper.visualize(graph, path_html="map-dataframe-test.html",
                 title="Map Dataframe Test")
IFrame("map-dataframe-test.html", 800, 600)

Wrote visualization to: map-dataframe-test.html
