<a href="https://colab.research.google.com/github/LGLV-Ciencia-de-Datos/Quimioinformatica_LGLV/blob/main/6_Espacio_Qu%C3%ADmico_tSNE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Incrustación de vecinos estocásticos distribuidos en t (t-SNE)

---
Realizó: Ana Chávez, Fernanda Saldívar, Armando Rufino y Hector Ortíz

Contacto: anachavez3026@gmail.com, fer.saldivarg@gmail.com

In [4]:
from IPython.utils import io
import tqdm.notebook
import os, sys, random
total = 100
with tqdm.notebook.tqdm(total=total) as pbar:
    with io.capture_output() as captured:
      # Instalar rdkit
      !pip -q install rdkit.pypi==2021.9.4
      pbar.update(25)
      # Instalar molplotly
      !pip install molplotly
      pbar.update(50)
      # Instalar jupyter-dash
      !pip install jupyter-dash
      pbar.update(75)
      # Instalar el diseño de aplicación dash
      !pip install dash-bootstrap-components
      pbar.update(100)

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Importar bibliotecas
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from scipy.spatial.distance import pdist

In [None]:
# Load the three compound datasets (BIOFACQUIM, FDA, DNMT1) from their CSV URLs.

# Remote CSV locations, one per dataset
url_biofacquim = "https://raw.githubusercontent.com/DIFACQUIM/Cursos/main/Datasets/BIOFACQUIM.V2_curada.csv"
url_fda = "https://raw.githubusercontent.com/DIFACQUIM/Cursos/main/Datasets/FDA_2022_july_05_curada.csv"
url_dnmt1 = "https://raw.githubusercontent.com/DIFACQUIM/Cursos/main/Datasets/DNMT1_curada.csv"

# Read each CSV into its own DataFrame
BIOFACQUIM = pd.read_csv(url_biofacquim)
FDA = pd.read_csv(url_fda)
DNMT1 = pd.read_csv(url_dnmt1)

# Preview the first two DNMT1 rows
DNMT1.head(2)

Unnamed: 0,ID,SMILES,Data set
0,"""CHEMBL2336409",Cc1cc(=Nc2ccc(NC(=O)c3ccc(N=c4cc[nH]c5ccccc45)...,DNMT1
1,"""CHEMBL1361703",COc1ccccc1CNC(=O)COC(=O)c1cc(-c2ccco2)nc2ccccc12,DNMT1


In [None]:
# Print the column index of each dataset (FDA, BIOFACQUIM, DNMT1 — in that order)
# so their schemas can be compared.
for frame in (FDA, BIOFACQUIM, DNMT1):
    print(frame.columns)

Index(['ID', 'SMILES', 'NEW_SMILES', 'Data set'], dtype='object')
Index(['ID', 'SMILES', 'Data set'], dtype='object')
Index(['ID', 'SMILES', 'Data set'], dtype='object')


In [None]:
# Reduce every dataset to the same three columns: ID, SMILES, Data set.
COMMON_COLUMNS = ["ID", "SMILES", "Data set"]

# FDA carries both SMILES and NEW_SMILES; NEW_SMILES is the one kept and it is
# exposed under the name SMILES. The guard makes this cell safe to re-run:
# once NEW_SMILES has been renamed there is nothing left to do, instead of
# failing with a KeyError on the second execution.
if "NEW_SMILES" in FDA.columns:
    FDA = (
        FDA
        .drop(columns="SMILES", errors="ignore")
        .rename(columns={"NEW_SMILES": "SMILES"})
    )

FDA = FDA[COMMON_COLUMNS]
BIOFACQUIM = BIOFACQUIM[COMMON_COLUMNS]
DNMT1 = DNMT1[COMMON_COLUMNS]

# Preview the first two FDA rows
FDA.head(2)

Unnamed: 0,ID,SMILES,Data set
0,DB00006,CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=...,FDA
1,DB00007,CCNC(=O)C1CCCN1C(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)...,FDA


In [None]:
# Stack the three datasets row-wise into one frame with a fresh 0..n-1 index.
data = pd.concat([FDA, BIOFACQUIM, DNMT1], axis=0, ignore_index=True)

In [None]:
# Generar descriptores
# Importar bibliotecas
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

# Compute molecular descriptors, one new column per descriptor.

# Parse every SMILES exactly once and reuse the molecule objects for all
# descriptors (previously each SMILES was re-parsed for every column).
mols = [Chem.MolFromSmiles(smiles) for smiles in data["SMILES"]]

# MolFromSmiles returns None when a SMILES cannot be parsed; fail early with a
# readable message rather than an opaque error inside the descriptor call.
invalid_smiles = [smiles for smiles, mol in zip(data["SMILES"], mols) if mol is None]
if invalid_smiles:
    raise ValueError(
        f"{len(invalid_smiles)} SMILES could not be parsed by RDKit, e.g. {invalid_smiles[:5]}"
    )

# Column name -> RDKit descriptor function (insertion order = column order)
descriptor_functions = {
    "HBA": Descriptors.NumHAcceptors,
    "HBD": Descriptors.NumHDonors,
    "RB": Descriptors.NumRotatableBonds,
    "LogP": Descriptors.MolLogP,
    "TPSA": Descriptors.TPSA,
    "MW": Descriptors.MolWt,
}
for column_name, descriptor in descriptor_functions.items():
    data[column_name] = [descriptor(mol) for mol in mols]

data.head(2)

Unnamed: 0,ID,SMILES,Data set,HBA,HBD,RB,LogP,TPSA,MW
0,DB00006,CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=...,FDA,29,27,66,-8.3261,904.07,2180.317
1,DB00007,CCNC(=O)C1CCCN1C(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)...,FDA,14,15,32,-1.4381,431.54,1209.421


In [None]:
# Entrenar modelo t-SNE
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import inspect

# Keep only the descriptor columns (drop the identifier/label columns) and
# standardise them; `drop` already returns a new frame, so `data` is untouched.
descriptors = data.drop(columns=["Data set", "ID", "SMILES"])
data_tsne = StandardScaler().fit_transform(descriptors)

# Newer scikit-learn releases renamed TSNE's `n_iter` to `max_iter`; choose the
# keyword the installed version accepts so the cell runs on either.
iter_param = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

# t-SNE is stochastic: fix random_state so the embedding is reproducible
# across Restart & Run All.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=40,
    random_state=42,
    **{iter_param: 300},
)
tsne_results = tsne.fit_transform(data_tsne)
tsne_results

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 3231 samples in 0.003s...
[t-SNE] Computed neighbors for 3231 samples in 0.234s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3231
[t-SNE] Computed conditional probabilities for sample 2000 / 3231
[t-SNE] Computed conditional probabilities for sample 3000 / 3231
[t-SNE] Computed conditional probabilities for sample 3231 / 3231
[t-SNE] Mean sigma: 0.199656
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.133026
[t-SNE] KL divergence after 300 iterations: 1.785170


array([[11.041617  , -0.94086605],
       [10.157844  , -0.6406828 ],
       [10.319824  , -0.5992008 ],
       ...,
       [ 4.6085362 , -7.300657  ],
       [ 4.662191  , -6.9937224 ],
       [ 1.7237955 , -5.659888  ]], dtype=float32)

In [None]:
# Pull the complementary (non-numeric) columns out of `data` as a NumPy array.
label = data[["Data set", "ID", "SMILES"]].to_numpy()
label.shape

(3231, 3)

In [None]:
# Join the label columns and the t-SNE coordinates side by side (column-wise).
arr = np.hstack((label, tsne_results))
arr.shape

(3231, 5)

In [None]:
# Build the plotting DataFrame from the combined array.
# `arr` mixes strings and floats, so it is an object array and every column
# would come out with dtype object; cast the two coordinate columns back to
# float so they behave as numeric data downstream.
tsne_dataset = pd.DataFrame(
    data=arr,
    columns=["Data set", "ID", "SMILES", "axis 1", "axis 2"],
).astype({"axis 1": "float32", "axis 2": "float32"})
tsne_dataset.head(2)

Unnamed: 0,Data set,ID,SMILES,axis 1,axis 2
0,FDA,DB00006,CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=...,11.041617,-0.940866
1,FDA,DB00007,CCNC(=O)C1CCCN1C(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)...,10.157844,-0.640683


In [None]:
# Graficar
import plotly.express as px
import molplotly
# Scatter plot of the t-SNE embedding, coloured by source dataset.
fig_tsne = px.scatter(
    tsne_dataset,
    x="axis 1",
    y="axis 2",
    color="Data set",
    color_discrete_sequence=["indigo", "green", "orange"],
    title="t-SNE",
    # `labels` maps COLUMN NAME -> displayed label; the mapping was previously
    # inverted, so it matched no column and had no effect.
    labels={"axis 1": "Axis 1", "axis 2": "Axis 2"},
    width=600,
    height=500,
)

# Wrap the figure in a molplotly app, passing the SMILES, ID and
# Data set columns of `tsne_dataset`.
app_marker = molplotly.add_molecules(
    fig=fig_tsne,
    df=tsne_dataset,
    smiles_col="SMILES",
    title_col="ID",
    color_col="Data set",
)

# Start the app on port 8060.
app_marker.run(port=8060)




JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



<IPython.core.display.Javascript object>

In [None]:
#==============================================================================#

## Para saber más:
* Medina-Franco JL, Sánchez-Cruz N, López-López E, Díaz-Eufracio BI. Progress on open chemoinformatic tools for expanding and exploring the chemical space. J Comput Aided Mol Des. 2022, 36, 341-354. DOI: [10.1007/s10822-021-00399-1](https://link.springer.com/article/10.1007/s10822-021-00399-1).
* Medina-Franco JL, Chávez-Hernández AL, López-López E, Saldívar-González FI. Chemical Multiverse: An Expanded View of Chemical Space. Mol Inform. 2022, 41, e2200116. DOI: [10.1002/minf.202200116](https://onlinelibrary.wiley.com/doi/10.1002/minf.202200116).
* Greener JG, Kandathil SM, Moffat L, Jones DT. A guide to machine learning for biologists. Nat Rev Mol Cell Biol. 2022, 23, 40-55. DOI: [10.1038/s41580-021-00407-0](https://www.nature.com/articles/s41580-021-00407-0).
* Bender A, Schneider N, Segler M, Patrick Walters W, Engkvist O, Rodrigues T. Evaluation guidelines for machine learning tools in the chemical sciences. Nat Rev Chem. 2022, 6, 428-442. DOI: [10.1038/s41570-022-00391-9](https://www.nature.com/articles/s41570-022-00391-9).