Import the libraries

In [1]:
import pandas as pd
import tmap as tmap
from faerun import Faerun

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from matplotlib.colors import ListedColormap
from rdkit.Chem.Descriptors import ExactMolWt

import pandas as pd
import numpy as np

Import the dataset

In [2]:
import pandas as pd

# URL of the CSV file hosted on GitHub
url = 'https://raw.githubusercontent.com/LeTue3004/Ai_duoc/main/Preprocessing_IDO1_Substructure%26IC50.csv'

# Read the CSV from the URL into a DataFrame
data = pd.read_csv(url)

# Show the first rows of the DataFrame
print(data.head())

   STT   CID                                             SMILES  IC50 (uM)  \
0    1  2378  C1=CC=C(C=C1)C2=CC=C(C=C2)C(C3=CC=CC=C3)N4C=CN=C4       23.0   
1    2  3198  C1=CC(=CC=C1COC(CN2C=CN=C2)C3=C(C=C(C=C3)Cl)Cl)Cl        4.6   
2    3  3675                                  C1=CC=C(C=C1)CCNN       14.0   
3    4  3760  C1=CC(=C(C(=C1)Cl)COC(CN2C=CN=C2)C3=C(C=C(C=C3...       23.0   
4    5  3823  CC(=O)N1CCN(CC1)C2=CC=C(C=C2)OCC3COC(O3)(CN4C=...       32.0   

  Substructure 1,3-diphenylurea 1H-indazole Beta-carboline Dihydropyridine  \
0    Imidazole              NaN         NaN            NaN             NaN   
1    Imidazole              NaN         NaN            NaN             NaN   
2        Other              NaN         NaN            NaN             NaN   
3    Imidazole              NaN         NaN            NaN             NaN   
4    Imidazole              NaN         NaN            NaN             NaN   

  Furan  ... Thiohydantoin Thiophene Triazole Tryptophan 1H-in

Encode SMILES as MACCS fingerprints

In [12]:
#MACCS
from tqdm import tqdm

def tmap_maccs_fpts(data):
    """Encode SMILES strings as MACCS-key fingerprints wrapped for TMAP.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings (for example a pandas Series). ``len(data)`` is used
        to size the progress bar.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from one ``tmap.VectorUint`` per molecule that
        could be parsed, in input order.

    Notes
    -----
    Entries that cannot be parsed are reported by position and skipped, so
    the result can be shorter than the input. Anything indexed by row
    (labels, activities, ...) only lines up with the fingerprints when no
    entry was skipped.
    """
    Maccs_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for count, smiles in enumerate(data):
            # Advance for every input, skipped or not, so the bar reaches 100%
            # and `count` is always the position of the current entry.
            pbar.update(1)
            try:
                mol = Chem.MolFromSmiles(smiles)
            except Exception:
                # Raised for inputs that are not strings at all (e.g. NaN).
                mol = None
            if mol is None:
                # RDKit signals an unparseable SMILES by returning None rather
                # than raising, so this check is what actually catches bad rows.
                print("An exception occurred with " + str(count))
                continue
            fpts = MACCSkeys.GenMACCSKeys(mol)
            mfpts = tmap.VectorUint(np.array(fpts))
            Maccs_fpts.append(mfpts)
    return np.array(Maccs_fpts)

Encode the SMILES strings of the IDO1 dataset

In [13]:
# Fingerprint every SMILES string in the dataset's "SMILES" column.
fps = tmap_maccs_fpts(data["SMILES"])

Progress: 100%|██████████| 3348/3348 [00:07<00:00, 441.93it/s]


GENERATE TMAP LAYOUT

In [14]:
# Per-compound series collected in row order for the Faerun plot.
ic50 = []
labels = []
molecular_weight = []
Substructure_group = []  # NOTE(review): never filled in the visible cells; kept in case another cell reads it
chembl_url = "https://www.ebi.ac.uk/chembl/compound_report_card/"  # NOTE(review): unused in the visible cells
pubchem_url = "https://pubchem.ncbi.nlm.nih.gov/compound/"

for i, row in data.iterrows():
    smiles = row['SMILES']
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for an unparseable SMILES. Stop here with the
        # offending row instead of failing later inside ExactMolWt with the
        # lists already partially extended.
        raise ValueError(f"Row {i}: RDKit could not parse SMILES {smiles!r}")

    # Drop a single trailing letter from the CID, if present.
    cid = str(row['CID'])
    if cid[-1].isalpha():
        cid = cid[:-1]

    # Label fields are separated by "__": SMILES, PubChem link, SMILES.
    # The link text is the CID so the field reads as an identifier.
    # Apostrophes are stripped from the finished label.
    labels.append(
        f'{smiles}__<a href="{pubchem_url + cid}" target="_blank">{cid}</a>__{smiles}'.replace(
            "'", ""
        )
    )
    ic50.append(row['IC50 (uM)'])
    # Exact molecular weight of the parsed molecule
    molecular_weight.append(ExactMolWt(mol))

Store the compound labels

In [19]:
# Select the substructure class by column name rather than by position, so the
# cell keeps working if columns are added or reordered upstream. The double
# brackets keep the result a one-column DataFrame, as before.
Substructure_classifier_groups = data[['Substructure']]
Substructure_classifier_groups

Unnamed: 0,Substructure
0,Imidazole
1,Imidazole
2,Other
3,Imidazole
4,Imidazole
...,...
3343,1H-indazole
3344,1H-indazole
3345,1H-indazole
3346,1H-indazole


In [15]:
# Print the lengths of the three per-compound lists for a visual comparison.
print(len(ic50),len(labels),len(molecular_weight))

3348 3348 3348


Preprocess the group labels

In [23]:
import scipy.stats as ss

# The substructure classes are categorical: build the legend labels and the
# integer-encoded array for them.
# Iterating a DataFrame yields its column labels, not its rows, so flatten the
# selection to a plain list of per-row values before handing it to Faerun.
# np.asarray(...).ravel() works for a one-column DataFrame and for a Series.
substructure_values = np.asarray(Substructure_classifier_groups).ravel().tolist()
Substructure_labels_groups, Substructure_groups = Faerun.create_categories(substructure_values)
assert len(Substructure_groups) == len(Substructure_classifier_groups), \
    "expected one category code per compound"

# Rank-scale IC50 and molecular weight: rank / n.
ic50_ranked = ss.rankdata(np.array(ic50) / max(ic50)) / len(ic50)
mw_ranked = ss.rankdata(np.array(molecular_weight) / max(molecular_weight)) / len(molecular_weight)


Layout the map

In [21]:
# Layout parameters
bits = 1024
k = 300

# Layout configuration (attributes set before the forest is built; the two
# steps do not depend on each other)
cfg = tmap.LayoutConfiguration()
cfg.node_size = 2
cfg.mmm_repeats = 2
cfg.sl_repeats = 2
cfg.k = k

# MinHash the binary fingerprints and index them in an LSH forest
enc = tmap.Minhash(bits)
lf = tmap.LSHForest(bits, 128)
minhashes = enc.batch_from_binary_array(fps)
lf.batch_add(minhashes)
lf.index()

# x, y: coordinates; s, t: edge endpoints (passed to add_tree as "from"/"to")
x, y, s, t, _ = tmap.layout_from_lsh_forest(lf, config=cfg)

In [22]:
# Number of x coordinates returned by the layout
len(x)

3348

Draw the map

In [25]:
# Faerun canvas for the interactive map
faerun = Faerun(view="front", clear_color="#e3d8c3", coords=False)

# Twelve-colour palette used for the categorical (substructure) series
custom_cmap = ListedColormap(
    ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
     "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
     "#000080", "#fcec03"],
    name="custom",
)

faerun.add_scatter(
    "chembl",
    {
        "x": x,
        "y": y,
        # Three colour series: substructure class, ranked IC50, ranked molecular weight
        "c": [Substructure_groups, ic50_ranked, mw_ranked],
        "labels": labels,
    },
    colormap=[custom_cmap, "viridis", "viridis"],
    point_scale=4.5,
    max_point_size=10,
    has_legend=True,
    categorical=[True, False, False],
    shader="smoothCircle",
    legend_labels=[Substructure_labels_groups],
    selected_labels=["SMILES", "PubChem CID", "Name"],
    # One title per colour series, in the same order as "c". The IC50 values
    # are read from the "IC50 (uM)" column, so the unit shown is uM.
    series_title=["Substructure", "IC50 (uM)", "Molecular Weight"],
    # Legend end points for the two continuous series show the raw extremes,
    # since the plotted values are rank-scaled.
    max_legend_label=[None, str(round(max(ic50))), str(round(max(molecular_weight)))],
    min_legend_label=[None, str(round(min(ic50))), str(round(min(molecular_weight)))],
    title_index=2,
    legend_title="",
)
# Draw the tree edges between the scatter points defined above
faerun.add_tree(
    "pubchem_tree", {"from": s, "to": t}, point_helper="chembl", color="#222222"
)
# NOTE(review): absolute, machine-specific output path; consider a path
# relative to the notebook so the cell runs on other machines.
file_name = "/home/tuele/code/Tmap"
faerun.plot(file_name=file_name, template="smiles")