In [59]:
import pandas as pd
import tmap as tmap
from faerun import Faerun

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from matplotlib.colors import ListedColormap
from rdkit.Chem.Descriptors import ExactMolWt

import pandas as pd
import numpy as np

In [60]:
# Workbook that stores the pre-split modelling data, one sheet per split.
train_test_path = "../../data_for_modeling/train_test_data/new_HDAC2_train_test_data.xlsx"

# Name of the column recording which split each row belongs to.
belong_col_name = "Belong_to"

# Sheet name -> split label written into `belong_col_name`.
split_label_by_sheet = {
    'train_dataset': "Training data",
    'test_dataset': "Testing data",
    'validation_dataset': "Validation data",
}

# A list of sheet names makes read_excel return a {sheet name: DataFrame} dict.
split_frames = pd.read_excel(train_test_path, sheet_name=list(split_label_by_sheet))
for sheet, split_label in split_label_by_sheet.items():
    split_frames[sheet][belong_col_name] = split_label

train_dataset = split_frames['train_dataset']
test_dataset = split_frames['test_dataset']
validation_dataset = split_frames['validation_dataset']

# Stack the three splits row-wise in the order train, validation, test.
# Each split keeps its own row index, so index labels repeat across splits.
all_dataset = pd.concat([train_dataset, validation_dataset, test_dataset], axis=0)

# ZBG classifier lookup table, read from the raw-data workbook.
all_data_path = "../../data_for_modeling/raw_data/new_HDAC2_raw_data.xlsx"
zgb_classifer_df = pd.read_excel(all_data_path, sheet_name='zbg_classifier_id')

In [61]:
# Row counts in the order: train, test, validation, combined.
split_sizes = [len(frame) for frame in (train_dataset, test_dataset, validation_dataset, all_dataset)]
print(*split_sizes)

1966 421 422 2809


In [62]:
# Preview the first rows of the combined dataset, including the added split column.
all_dataset.head()

Unnamed: 0,CID,SMILES,Categories,ZBG Classified,Unnamed: 4,Unnamed: 5,Unnamed: 6,Belong_to
0,71465855,Nc1ccc(cc1NC(=O)N1CCOCC1)-c1cccs1,active,21,,,,Training data
1,5281220,O=C1/C(=C/c2ccc(O)c(O)c2)Oc2cc(O)cc(O)c21,inactive,5,,,,Training data
2,162654159,CCC(=O)CCCCC[C@H](NC(=O)[C@H]1CC12CCN(C)CC2)c1...,inactive,5,15.0,,,Training data
3,44571332,O=C(NO)c1ccc(CN2C(=O)CNC(=O)[C@H]2Cc2cccs2)cc1,inactive,1,15.0,,,Training data
4,130361881,O=C(NO)c1ccc(CN2CCCc3ccccc32)cc1,inactive,1,15.0,,,Training data


In [63]:
# Display the ZBG classifier lookup table ('ZBG Classified' id -> 'Name').
zgb_classifer_df

Unnamed: 0,ZBG Classified,Name
0,1,Acid hydroxamic
1,2,N-substituted hydroxamic acid
2,3,Carboxylic acid
3,4,Benzamide
4,5,"Alkyl ketone, aryl ketone, epoxyketone"
5,6,Cyclic peptides and cyclic peptides
6,8,"Trifluoromethylketone, trifluoromethyloxadiazole"
7,9,Thiol
8,10,Boronic acid
9,11,Disulfide


In [64]:
# Build the {ZBG class id: class name} lookup by pairing the two columns row by row.
zgb_dict = dict(zip(zgb_classifer_df['ZBG Classified'], zgb_classifer_df['Name']))
zgb_dict

{1: 'Acid hydroxamic',
 2: 'N-substituted hydroxamic acid',
 3: 'Carboxylic acid',
 4: 'Benzamide',
 5: 'Alkyl ketone, aryl ketone, epoxyketone',
 6: 'Cyclic peptides and cyclic peptides',
 8: 'Trifluoromethylketone, trifluoromethyloxadiazole',
 9: 'Thiol',
 10: 'Boronic acid',
 11: 'Disulfide',
 12: 'Amino acid derivative',
 13: 'Tropolone',
 14: '3-hydroxypyridin-2-thione',
 15: 'Carboxamide',
 16: 'Benzoyl-hydrazide',
 17: 'Hydroxypyrimidine',
 18: 'Hydroxyureas',
 19: 'Chalcone',
 20: 'Aniline-benzamid',
 21: 'Others'}

In [65]:
#MACCS
from tqdm import tqdm

def tmap_maccs_fpts(data):
    """Compute MACCS-key fingerprints for a collection of SMILES strings.

    Every SMILES is parsed with RDKit, its MACCS keys are converted to a
    NumPy array and wrapped in a ``tmap.VectorUint``.

    Entries that cannot be parsed or fingerprinted are reported by their
    position in ``data`` and skipped, so the result can be shorter than the
    input. Callers that keep parallel per-row lists must account for that.

    Args:
        data: Sized iterable of SMILES strings (``len(data)`` sizes the
            progress bar).

    Returns:
        ``np.array`` built from the list of ``tmap.VectorUint`` fingerprints,
        in input order, without the skipped entries.
    """
    Maccs_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
                # RDKit reports an unparsable SMILES by returning None, not by raising.
                if mol is None:
                    raise ValueError("SMILES could not be parsed")
                fpts = MACCSkeys.GenMACCSKeys(mol)
            except Exception:
                # Best effort: report the true input position and move on.
                print("An exception occurred with " + str(position))
            else:
                Maccs_fpts.append(tmap.VectorUint(np.array(fpts)))
            # Advance for skipped rows too, so the bar always reaches 100%.
            pbar.update(1)
    return np.array(Maccs_fpts)

#Morgan2
def tmap_morgan_fpts(data):
    """Compute 1024-bit Morgan (radius 2) fingerprints for SMILES strings.

    Every SMILES is parsed with RDKit, fingerprinted with
    ``AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)``, converted to a
    NumPy array and wrapped in a ``tmap.VectorUint``.

    Entries that cannot be parsed or fingerprinted are reported by their
    position in ``data`` and skipped, so the result can be shorter than the
    input. Callers that keep parallel per-row lists must account for that.

    Args:
        data: Sized iterable of SMILES strings (``len(data)`` sizes the
            progress bar).

    Returns:
        A list of ``tmap.VectorUint`` fingerprints, in input order, without
        the skipped entries.
    """
    Morgan_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
                # RDKit reports an unparsable SMILES by returning None, not by raising.
                if mol is None:
                    raise ValueError("SMILES could not be parsed")
                fpts = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
            except Exception:
                # Best effort: report the true input position and move on.
                print("An exception occurred with " + str(position))
            else:
                Morgan_fpts.append(tmap.VectorUint(np.array(fpts)))
            # Advance for skipped rows too, so the bar always reaches 100%.
            pbar.update(1)
    return Morgan_fpts

def ecfp4_tmap_fpts(data):
    """Compute 2048-bit Morgan (radius 2) fingerprints for SMILES strings.

    Every SMILES is parsed with RDKit, fingerprinted with
    ``AllChem.GetMorganFingerprintAsBitVect(mol=mol, radius=2, nBits=2048)``,
    converted to a NumPy array and wrapped in a ``tmap.VectorUint``.

    Entries that cannot be parsed or fingerprinted are reported by their
    position in ``data`` and skipped, so the result can be shorter than the
    input. Callers that keep parallel per-row lists must account for that.

    Args:
        data: Sized iterable of SMILES strings (``len(data)`` sizes the
            progress bar).

    Returns:
        ``np.array`` built from the list of ``tmap.VectorUint`` fingerprints,
        in input order, without the skipped entries.
    """
    all_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
                # RDKit reports an unparsable SMILES by returning None, not by raising.
                if mol is None:
                    raise ValueError("SMILES could not be parsed")
                fpts = AllChem.GetMorganFingerprintAsBitVect(mol=mol, radius=2, nBits=2048)
            except Exception:
                # Best effort: report the true input position and move on.
                print("An exception occurred with " + str(position))
            else:
                all_fpts.append(tmap.VectorUint(np.array(fpts)))
            # Advance for skipped rows too, so the bar always reaches 100%.
            pbar.update(1)
    return np.array(all_fpts)

In [66]:
# Alternative fingerprint (MACCS keys), kept for reference:
# fps = tmap_maccs_fpts(all_dataset["SMILES"])
# Fingerprint every SMILES of the combined dataset (Morgan radius 2, 2048 bits).
fps = ecfp4_tmap_fpts(all_dataset["SMILES"])

Progress:   0%|          | 0/2809 [00:00<?, ?it/s]

Progress: 100%|██████████| 2809/2809 [00:01<00:00, 1545.77it/s]


# Generate Tmap Layout

Generate labels

In [67]:
# Per-compound series for the Faerun scatter plot. Every list gets exactly one
# entry per row of `all_dataset`, in row order, so they stay index-aligned.
# ic50 = []
labels = []
belong_to_groups = []
zbg_classifier_groups = []
active_inactive_groups = []
molecular_weight = []
chembl_url = "https://www.ebi.ac.uk/chembl/compound_report_card/"
pubchem_url = "https://pubchem.ncbi.nlm.nih.gov/compound/"

for row_index, row in all_dataset.iterrows():
    smiles = row['SMILES']
    mol = AllChem.MolFromSmiles(smiles)
    # RDKit returns None for an unparsable SMILES. Fail loudly before anything
    # is appended, so the lists can never drift out of alignment.
    if mol is None:
        raise ValueError(f"Could not parse SMILES at row {row_index}: {smiles!r}")

    cid = str(row['CID'])
    # Drop a trailing letter suffix from the identifier, if there is one.
    if cid[-1].isalpha():
        cid = cid[:-1]

    # Label fields are separated by "__" and are titled by `selected_labels`
    # in the plotting cell: SMILES, PubChem CID, Name. The link text of the
    # second field is the CID itself, so the "PubChem CID" entry shows the
    # identifier rather than a second copy of the SMILES.
    labels.append(
        f'{smiles}__<a href="{pubchem_url + cid}" target="_blank">{cid}</a>__{smiles}'.replace(
            "'", ""
        )
    )
    # ic50.append(row['AVG_IC50_nM'])
    # ZBG class name for this compound
    zbg_classifier_groups.append(zgb_dict[row['ZBG Classified']])
    # Active / inactive category
    active_inactive_groups.append(row['Categories'])
    # Dataset split the compound belongs to
    belong_to_groups.append(row[belong_col_name])
    # Molecular weight (RDKit ExactMolWt)
    molecular_weight.append(ExactMolWt(mol))

In [68]:
# print(len(belong_to_groups), len(ic50), len(zbg_classifier_groups), len(active_inactive_groups), len(labels))
# Sanity check: all per-compound series should have the same length.
series_lengths = [
    len(series)
    for series in (belong_to_groups, zbg_classifier_groups, active_inactive_groups, labels)
]
print(*series_lengths)

2809 2809 2809 2809


__Preprocessing the group labels__

In [69]:
import scipy.stats as ss

# The three group series are categorical: for each one, get the legend labels
# together with the integer-encoded values.
zgb_labels_groups, zgb_groups = Faerun.create_categories(zbg_classifier_groups)
activity_labels_groups, activity_groups = Faerun.create_categories(active_inactive_groups)
belong_labels_groups, belong_groups = Faerun.create_categories(belong_to_groups)

# Scale IC50 (disabled: no IC50 series is collected at the moment)
# ic50_ranked = ss.rankdata(np.array(ic50) / max(ic50)) / len(ic50)

# Molecular weight: scale by the maximum, rank, then divide the ranks by the
# number of compounds.
mw_scaled = np.array(molecular_weight) / max(molecular_weight)
mw_ranked = ss.rankdata(mw_scaled) / len(molecular_weight)

In [70]:
# Inspect the (index, label) legend pairs created for the dataset-split series.
belong_labels_groups

[(0, 'Testing data'), (1, 'Training data'), (2, 'Validation data')]

In [71]:
# NOTE(review): `bits` is passed as the first argument to both Minhash and
# LSHForest. It is not the fingerprint length (ecfp4_tmap_fpts uses nBits=2048);
# presumably it is the MinHash dimensionality — confirm against the tmap docs.
bits = 1024
# Assigned to cfg.k below; presumably the nearest-neighbour count used for the
# layout graph — confirm against the tmap docs.
k = 300
enc = tmap.Minhash(bits)
lf = tmap.LSHForest(bits, 128)
# Encode the binary fingerprints with the Minhash encoder and add them in one batch.
lf.batch_add(enc.batch_from_binary_array(fps))
# Index the forest before it is queried by the layout call below.
lf.index()
# Layout settings; the meaning of the individual fields is defined by tmap, not shown here.
cfg = tmap.LayoutConfiguration()
cfg.k = k
cfg.sl_repeats = 2
cfg.mmm_repeats = 2
cfg.node_size = 2
# x, y are used as point coordinates and s, t as the "from"/"to" tree edges in
# the plotting cell; the fifth return value is discarded.
x, y, s, t, _ = tmap.layout_from_lsh_forest(lf, config=cfg)

In [72]:
# Number of x coordinates returned by the layout.
len(x)

2809

In [73]:
# Render the layout as an interactive Faerun HTML page with four colour series:
# ZBG class, dataset split, activity category and molecular-weight rank.
faerun = Faerun(view="front", clear_color="#e3d8c3", coords=False)
# NOTE(review): this palette has 12 colours while zgb_dict lists 20 ZBG classes;
# if more than 12 classes occur in the data, some may end up sharing a colour — confirm.
custom_cmap = ListedColormap(
    ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", 
     "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf", 
     "#000080", "#fcec03"],
    name="custom",
)
# The per-series lists below (colormap, categorical, series_title,
# max/min_legend_label) follow the order of the "c" series.
faerun.add_scatter(
        "chembl",
        {"x": x, 
         "y": y, 
         "c": [zgb_groups, belong_groups, activity_groups, mw_ranked], 
         "labels": labels},
        colormap=[custom_cmap, custom_cmap, custom_cmap, "viridis"],
        point_scale=4.5,
        max_point_size=10,
        has_legend=True,
        categorical=[True, True, True, False],
        shader="smoothCircle",
        # Legend entries are supplied for the three categorical series only.
        legend_labels=[zgb_labels_groups, belong_labels_groups, activity_labels_groups],
        # Titles for the "__"-separated fields of each entry in `labels`.
        selected_labels=["SMILES", "PubChem CID", "Name"],
        series_title=["ZGB Classifier group", "Dataset label group", "Activity label group", "Molecular Weight"],
        # Only the continuous molecular-weight series gets min/max legend values.
        max_legend_label=[None, None, None, str(round(max(molecular_weight)))],
        min_legend_label=[None, None, None, str(round(min(molecular_weight)))],
        title_index=2,
        legend_title=""
    )
# Draw the tree edges between the points of the "chembl" scatter.
faerun.add_tree(
    "pubchem_tree", {"from": s, "to": t}, point_helper="chembl", color="#222222"
)
# You may see a different TMAP from the one we generated, since the indices of the data points are generated randomly depending on the runtime environment.
# However, the way the branches of your TMAP are connected will be similar to ours.
faerun.plot(file_name="../../results/tmap/ecfp4_tmap_hdac2.html", template="smiles")

In [74]:
# faerun = Faerun(view="front", clear_color="#e3d8c3", coords=False)
# custom_cmap = ListedColormap(
#     ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", 
#      "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf", 
#      "#000080", "#fcec03"],
#     name="custom",
# )
# faerun.add_scatter(
#         "chembl",
#         {"x": x, 
#          "y": y, 
#          "c": [zgb_groups, belong_groups, activity_groups, ic50_ranked, mw_ranked], 
#          "labels": labels},
#         colormap=[custom_cmap, custom_cmap, custom_cmap, "viridis", "viridis"],
#         point_scale=4.5,
#         max_point_size=10,
#         has_legend=True,
#         categorical=[True, True, True, False, False],
#         shader="smoothCircle",
#         legend_labels=[zgb_labels_groups, belong_labels_groups, activity_labels_groups],
#         selected_labels=["SMILES", "PubChem CID", "Name"],
#         series_title=["ZGB Classifier group", "Dataset label group", "Activity label group", "IC50 (nM)", "Molecular Weight"],
#         max_legend_label=[None, None, None, str(round(max(ic50))), str(round(max(molecular_weight)))],
#         min_legend_label=[None, None, None, str(round(min(ic50))), str(round(min(molecular_weight)))],
#         title_index=2,
#         legend_title=""
#     )
# faerun.add_tree(
#     "pubchem_tree", {"from": s, "to": t}, point_helper="chembl", color="#222222"
# )
# # You may see a different TMAP from the one we generated, since the indices of the data points are generated randomly depending on the runtime environment.
# # However, the way the branches of your TMAP are connected will be similar to ours.
# faerun.plot(file_name="../../results/tmap/ecfp4_tmap_hdac2.html", template="smiles")