In [73]:
import pandas as pd

# Read both substrate tables (tab-separated) and tag every row with its origin
# so the two sets remain distinguishable once they are stacked.
lit_cluster_heads = pd.read_csv("data/reaxys/cluster_heads.tsv", sep="\t").assign(source="literature")
exp = pd.read_csv("data/abbvie-substrates.tsv", sep="\t").assign(source="experimental")

# Literature rows first, experimental rows second, renumbered 0..n-1.
# The reaction SMILES is "<reactant 1>.<reactant 2>>><product>".
combined = pd.concat([lit_cluster_heads, exp], ignore_index=True).assign(
    rxn_smiles=lambda df: (
        df["startingmat_1_smiles"] + "." + df["startingmat_2_smiles"] + ">>" + df["product_1_smiles"]
    )
)

In [13]:
import numpy as np
from rdkit import Chem


def smi2numpyarr(smi: str) -> np.ndarray:
    """Generate a Morgan fingerprint numpy array from a SMILES string.

    Args:
        smi: SMILES string of the molecule to fingerprint.

    Returns:
        The array returned by ``GetFingerprintAsNumPy`` for a Morgan
        fingerprint generator created with RDKit's default settings.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    # Import the submodule explicitly: ``from rdkit import Chem`` on its own is
    # not guaranteed to have loaded ``Chem.rdFingerprintGenerator``, so relying
    # on attribute access only works when something else imported it first.
    from rdkit.Chem import rdFingerprintGenerator

    mol = Chem.MolFromSmiles(smi)
    # MolFromSmiles reports a parse failure by returning None; fail here with
    # the offending input instead of an opaque error inside the generator.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smi!r}")

    fpgen = rdFingerprintGenerator.GetMorganGenerator()
    return fpgen.GetFingerprintAsNumPy(mol)


# One fingerprint per product SMILES, collected in the row order of `combined`.
fingerprints = np.array(list(map(smi2numpyarr, combined["product_1_smiles"])))

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [28]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

# Fixed seed so the stochastic embeddings (t-SNE, UMAP, and PCA when it picks a
# randomized solver) come out the same after Restart Kernel -> Run All.
RANDOM_SEED = 42

# `fingerprints` follows the row order of `combined`, which stacks the
# literature rows first; everything from index n_lit onward is experimental.
n_lit = len(lit_cluster_heads)

# PCA straight on the fingerprints, keeping 8 components.
pca = PCA(n_components=8, random_state=RANDOM_SEED)
X_pca = pca.fit_transform(fingerprints)
X_pca_lit = X_pca[:n_lit]
X_pca_exp = X_pca[n_lit:]

# t-SNE is run on the 8-component PCA output rather than on the raw fingerprints.
tsne = TSNE(n_components=2, random_state=RANDOM_SEED)
X_tsne = tsne.fit_transform(X_pca)
X_tsne_lit = X_tsne[:n_lit]
X_tsne_exp = X_tsne[n_lit:]

# UMAP on the raw fingerprints.
# NOTE(review): this embeds into 8 dimensions, but the plotting cells only use
# columns 0 and 1. If the goal is a 2-D map, n_components=2 is probably what is
# wanted -- left unchanged here in case other code relies on the 8-D shape.
# NOTE(review): fixing random_state may make umap-learn run single-threaded -- confirm.
umap = UMAP(n_components=8, random_state=RANDOM_SEED)
X_umap = umap.fit_transform(fingerprints)
X_umap_lit = X_umap[:n_lit]
X_umap_exp = X_umap[n_lit:]

In [74]:
def _with_xy(frame: pd.DataFrame, embedding: np.ndarray) -> pd.DataFrame:
    """Return a copy of ``frame`` with plot coordinates attached.

    Args:
        frame: Table with one row per embedded molecule.
        embedding: 2-D array with one row per row of ``frame``; only its first
            two columns are used.

    Returns:
        A new DataFrame with every column of ``frame`` plus ``x`` (embedding
        column 0) and ``y`` (embedding column 1). ``frame`` is not modified.
    """
    return frame.assign(x=embedding[:, 0], y=embedding[:, 1])


# One plotting table per projection. The "source" column is already carried
# over from `combined`, so it does not need to be assigned again.
pca_combined = _with_xy(combined, X_pca)
tsne_combined = _with_xy(combined, X_tsne)
umap_combined = _with_xy(combined, X_umap)

In [82]:
import molplotly
import plotly.express as px

# Scatter of the first two PCA components, coloured by data source.
fig = px.scatter(
    pca_combined,
    x="x",
    y="y",
    color="source",
    title="PCA",
    width=650,
    height=650,
)

# Wrap the figure in a molplotly app that renders the product structure on hover.
app = molplotly.add_molecules(
    fig=fig,
    df=pca_combined,
    smiles_col="product_1_smiles",
    color_col="source",
    # title_col="rxn_id",
)

# Give this app its own port: several inline Dash apps in one notebook that
# share the default port replace each other's server, which breaks the hover
# callbacks of the plots rendered earlier.
app.run(jupyter_mode="inline", port=8051)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



In [89]:
import molplotly
import plotly.express as px

# Scatter of the 2-D t-SNE embedding, coloured by data source.
fig = px.scatter(
    tsne_combined,
    x="x",
    y="y",
    color="source",
    title="t-SNE",
    width=625,
    height=625,
)


# Wrap the figure in a molplotly app that renders the product structure on
# hover, captioned with the reaction id and cluster columns.
app = molplotly.add_molecules(
    fig=fig,
    df=tsne_combined,
    smiles_col="product_1_smiles",
    color_col="source",
    caption_cols=["rxn_id", "cluster"],
)

# Give this app its own port: several inline Dash apps in one notebook that
# share the default port replace each other's server, which breaks the hover
# callbacks of the plots rendered earlier.
app.run(jupyter_mode="inline", port=8052)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



In [81]:
# Scatter of the first two UMAP coordinates, coloured by data source.
fig = px.scatter(
    umap_combined,
    x="x",
    y="y",
    color="source",
    title="UMAP",
    width=625,
    height=625,
)

# Wrap the figure in a molplotly app that renders the product structure on hover.
app = molplotly.add_molecules(
    fig=fig,
    df=umap_combined,
    smiles_col="product_1_smiles",
    color_col="source",
    # title_col="rxn_id",
)

# Give this app its own port: several inline Dash apps in one notebook that
# share the default port replace each other's server, which breaks the hover
# callbacks of the plots rendered earlier.
app.run(jupyter_mode="inline", port=8053)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.

