In [21]:
import pandas as pd
import tmap as tmap
from faerun import Faerun

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from matplotlib.colors import ListedColormap
from rdkit.Chem.Descriptors import ExactMolWt

import pandas as pd
import numpy as np
from tqdm import tqdm

In [22]:
# Read the raw Excel workbook into a DataFrame. The path is relative, so it
# resolves against the kernel's current working directory.
dataset = pd.read_excel("../../data/raw_data/all_raw_data_pan_HDAC.xlsx")

In [23]:
# Sanity check on the load: number of rows, then a preview of the first rows.
print(len(dataset))
dataset.head()

2179


Unnamed: 0,MolRegNo,Value,Unit,Type,SMILES,IS_LABELED
0,236556,1556.0,nM,IC50,[O-][N+](c(cc1)ccc1C(NCCCCCC(NO)=O)=O)=O,False
1,1352187,12.6,nM,IC50,[O-][N+](c(cc1)ccc1S([n](cc1)c2c1cc(/C=C/C(NO)...,True
2,418270,1600.0,nM,IC50,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(/C=C/C(NO...,False
3,418245,400.0,nM,IC50,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(C(NO)=O)[...,False
4,2163995,40.0,nM,IC50,[O-][N+](c(cccc1)c1-c1c[s]c(NC(CCCCCCC(NO)=O)=...,False


In [24]:
# NOTE(review): this repeats the `dataset.head()` preview already displayed by
# the previous cell; candidate for removal.
dataset.head()

Unnamed: 0,MolRegNo,Value,Unit,Type,SMILES,IS_LABELED
0,236556,1556.0,nM,IC50,[O-][N+](c(cc1)ccc1C(NCCCCCC(NO)=O)=O)=O,False
1,1352187,12.6,nM,IC50,[O-][N+](c(cc1)ccc1S([n](cc1)c2c1cc(/C=C/C(NO)...,True
2,418270,1600.0,nM,IC50,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(/C=C/C(NO...,False
3,418245,400.0,nM,IC50,[O-][N+](c(cc1)ccc1S(N(CC1)CCN1c1ncc(C(NO)=O)[...,False
4,2163995,40.0,nM,IC50,[O-][N+](c(cccc1)c1-c1c[s]c(NC(CCCCCCC(NO)=O)=...,False


In [25]:
def tmap_maccs_fpts(data):
    """Compute MACCS-key fingerprints for a collection of SMILES strings.

    Each SMILES is parsed with RDKit, its MACCS keys are generated, and the
    resulting bit vector is converted to a ``tmap.VectorUint``.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings (for example a pandas Series). ``len(data)`` is used
        to size the progress bar.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the per-molecule ``tmap.VectorUint`` objects,
        in input order.

    Notes
    -----
    Entries that cannot be turned into a molecule are reported and skipped,
    so the result can be shorter than ``data``. Callers that pair the
    fingerprints with per-row data must check the lengths still match.
    """
    Maccs_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
            except Exception:
                # Raised for non-string input (e.g. a missing value).
                mol = None
            if mol is None:
                # RDKit reports an unparsable SMILES by returning None rather
                # than raising, so this case needs an explicit check.
                print(f"Skipping entry at position {position}: could not parse SMILES {smiles!r}")
            else:
                fpts = MACCSkeys.GenMACCSKeys(mol)
                Maccs_fpts.append(tmap.VectorUint(np.array(fpts)))
            # Advance for skipped entries too, so the bar reaches its total.
            pbar.update(1)
    return np.array(Maccs_fpts)

#Morgan2
def tmap_morgan_fpts(data, radius=2, n_bits=1024):
    """Compute Morgan bit-vector fingerprints for a collection of SMILES strings.

    Each SMILES is parsed with RDKit, a Morgan fingerprint bit vector is
    generated, and the result is converted to a ``tmap.VectorUint``.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings (for example a pandas Series). ``len(data)`` is used
        to size the progress bar.
    radius : int, optional
        Morgan radius passed to RDKit (default 2, the original fixed value).
    n_bits : int, optional
        Bit-vector length passed to RDKit (default 1024, the original fixed
        value).

    Returns
    -------
    list
        One ``tmap.VectorUint`` per successfully parsed SMILES, in input
        order.

    Notes
    -----
    Entries that cannot be turned into a molecule are reported and skipped,
    so the result can be shorter than ``data``. Callers that pair the
    fingerprints with per-row data must check the lengths still match.
    """
    Morgan_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
            except Exception:
                # Raised for non-string input (e.g. a missing value).
                mol = None
            if mol is None:
                # RDKit reports an unparsable SMILES by returning None rather
                # than raising, so this case needs an explicit check.
                print(f"Skipping entry at position {position}: could not parse SMILES {smiles!r}")
            else:
                fpts = AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits)
                Morgan_fpts.append(tmap.VectorUint(np.array(fpts)))
            # Advance for skipped entries too, so the bar reaches its total.
            pbar.update(1)
    return Morgan_fpts

In [26]:
# MACCS fingerprints for every SMILES in the dataset; these feed the
# MinHash / LSH-forest step further down.
fps = tmap_maccs_fpts(dataset["SMILES"])

# tmap_maccs_fpts skips SMILES it cannot process, while the labels, IC50
# values and molecular weights built later have one entry per dataset row.
# Fail here rather than plot points with the wrong annotations.
assert len(fps) == len(dataset), (
    f"{len(dataset) - len(fps)} SMILES were skipped; "
    "fingerprints no longer line up with the dataset rows"
)

Progress:   0%|          | 0/2179 [00:00<?, ?it/s][10:12:50] Conflicting single bond directions around double bond at index 15.
[10:12:50]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:50] Conflicting single bond directions around double bond at index 27.
[10:12:50]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:50] Conflicting single bond directions around double bond at index 26.
[10:12:50]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:50] Conflicting single bond directions around double bond at index 7.
[10:12:50]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:50] Conflicting single bond directions around double bond at index 7.
[10:12:50]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:50] Conflicting single bond directions around double bond at index 8.
[10:12:50]   BondStereo set to STEREONONE and single bond directions set to

# Generate Tmap Layout

Generate labels

In [27]:
# Per-row annotations for the plot, collected in dataset row order so that
# position i in every list describes the same molecule.
ic50 = []
is_labeled = []
molecular_weight = []
labels = []
# NOTE(review): not referenced by any visible cell.
pubchem_url = "https://pubchem.ncbi.nlm.nih.gov/compound/"

for i, row in dataset.iterrows():
    smiles = row['SMILES']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for an unparsable SMILES; fail with the
        # row index instead of an opaque error inside ExactMolWt.
        raise ValueError(f"Row {i}: could not parse SMILES {smiles!r}")

    # Registry number as text, minus a trailing letter suffix if present.
    cid = str(row['MolRegNo'])
    if cid[-1:].isalpha():
        cid = cid[:-1]

    # The plot cell declares three `__`-separated label fields
    # (selected_labels=["SMILES", "MolRegNo", "Name"]) and uses title_index=2,
    # so each label needs three fields with the registry number second.
    # NOTE(review): the dataset has no name column, so the registry number
    # also stands in as the third ("Name" / title) field — confirm.
    labels.append(
        f'{smiles}__{cid}__{cid}'.replace("'", "")
    )
    ic50.append(row['Value'])
    is_labeled.append(row["IS_LABELED"])
    # Exact (monoisotopic) molecular weight from RDKit.
    molecular_weight.append(ExactMolWt(mol))

[10:12:52] Conflicting single bond directions around double bond at index 15.
[10:12:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:52] Conflicting single bond directions around double bond at index 27.
[10:12:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:52] Conflicting single bond directions around double bond at index 26.
[10:12:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:52] Conflicting single bond directions around double bond at index 7.
[10:12:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:52] Conflicting single bond directions around double bond at index 7.
[10:12:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:52] Conflicting single bond directions around double bond at index 8.
[10:12:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
[10:12:52] Conflicting single bond directio

In [28]:
# Sizes of the four per-row annotation lists, printed side by side; they
# should all be equal.
print(*(len(seq) for seq in (molecular_weight, ic50, is_labeled, labels)))

2179 2179 2179 2179


__Preprocessing the group labels__

In [29]:
import scipy.stats as ss


def _rank_scale(values):
    """Rank ``values`` with ``scipy.stats.rankdata`` and divide by their count.

    The values are divided by their maximum before ranking, exactly as in
    the original inline expressions.
    """
    return ss.rankdata(np.array(values) / max(values)) / len(values)


# The labelled flag is categorical: get the legend entries and the
# integer-encoded array for it.
is_labeled_labels_groups, is_labeled_groups = Faerun.create_categories(is_labeled)

# Rank-scale the continuous series used for colouring.
ic50_ranked = _rank_scale(ic50)
mw_ranked = _rank_scale(molecular_weight)

In [30]:
# Inspect the legend entries returned by Faerun.create_categories.
is_labeled_labels_groups

[(0, False), (1, True)]

In [35]:
# Replace the legend entries with display text, keeping the integer codes.
# This hard-codes the code order, so it must match what create_categories
# returned (the earlier cell output shows 0 for False and 1 for True).
# NOTE(review): the strings look like unaccented Vietnamese ("Chua loc" ~ not
# yet filtered, "Da loc" ~ filtered) — confirm the wording with the author.
is_labeled_labels_groups = [(0, "Chua loc"), (1, "Da loc")]
is_labeled_labels_groups

[(0, 'Chua loc'), (1, 'Da loc')]

In [36]:
# Tunable sizes: `bits` is shared by the MinHash encoder and the LSH forest,
# `k` is copied into the layout configuration.
bits = 1024
k = 100

# Layout settings (see the tmap LayoutConfiguration docs for their meaning).
cfg = tmap.LayoutConfiguration()
cfg.node_size = 2
cfg.mmm_repeats = 2
cfg.sl_repeats = 2
cfg.k = k

# MinHash-encode the binary fingerprints.
enc = tmap.Minhash(bits)
minhashes = enc.batch_from_binary_array(fps)

# Add the encoded fingerprints to the forest and build its index.
lf = tmap.LSHForest(bits, 128)
lf.batch_add(minhashes)
lf.index()

# x / y are the point coordinates, s / t the edge endpoints passed to
# add_tree; the fifth return value is not used.
x, y, s, t, _ = tmap.layout_from_lsh_forest(lf, config=cfg)

In [37]:
# Number of layout coordinates produced; compare with the dataset row count.
len(x)

2179

In [40]:
# Build the interactive Faerun plot: one scatter layer with three colour
# series, plus the tree edges drawn between the scatter's points.
faerun = Faerun(view="front", clear_color="#e3d8c3", coords=False)
# Discrete palette, paired below with the categorical (first) series.
custom_cmap = ListedColormap(
    ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", 
     "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf", 
     "#000080", "#fcec03"],
    name="custom",
)
# The per-series lists (c, colormap, categorical, series_title and the
# min/max legend labels) are ordered the same way: labelled flag, IC50,
# molecular weight.
faerun.add_scatter(
        "chembl",
        {"x": x, 
         "y": y, 
         "c": [is_labeled_groups, ic50_ranked, mw_ranked], 
         "labels": labels},
        colormap=[custom_cmap, "viridis", "viridis"],
        point_scale=4.5,
        max_point_size=10,
        has_legend=True,
        categorical=[True, False, False],
        shader="smoothCircle",
        legend_labels=[is_labeled_labels_groups],
        # NOTE(review): per the faerun docs these name the `__`-separated
        # fields of each entry in `labels`, and title_index selects one of
        # those fields, so every label needs at least three fields — confirm
        # against how `labels` is built.
        selected_labels=["SMILES", "MolRegNo", "Name"],
        series_title=["Da loc tay", "IC50 (nM)", "Molecular Weight"],
        # The colours come from the rank-scaled arrays, but the legend ends
        # show the raw minimum / maximum values.
        max_legend_label=[None, str(round(max(ic50))), str(round(max(molecular_weight)))],
        min_legend_label=[None, str(round(min(ic50))), str(round(min(molecular_weight)))],
        title_index=2,
        legend_title=""
    )
# point_helper names the scatter layer ("chembl") whose points the edges join.
faerun.add_tree(
    "pubchem_tree", {"from": s, "to": t}, point_helper="chembl", color="#222222"
)
# You may see a different TMAP from the one we generated, because the indices of the data points are assigned randomly depending on the runtime environment.
# However, the way the branches of your TMAP are connected will be similar to ours.
faerun.plot(file_name="../../results/data_visualization/tmap/all_raw_data", template="smiles")