# Better Data Splits for Machine Learning with `astartes`

Dataset: acyclic molecules and their boiling points (`trainset0`), available at:
https://brunl01.users.greyc.fr/CHEMISTRY/

molplotly

In [1]:
import pandas as pd

# Read the table of IUPAC names and boiling points into a DataFrame.
# The Python parsing engine is requested explicitly alongside the custom separator.
with open("iupac_and_boiling.csv", "r") as csv_file:
    data = pd.read_csv(csv_file, sep="   ", engine="python")
# A bare trailing expression makes the notebook render the frame.
data


Unnamed: 0,iupac_name,boiling_point
0,ethylmethylether,10.80
1,diethylether,34.60
2,isopropylmethylether,32.00
3,butylmethylether,70.30
4,ethylpropylether,63.60
...,...,...
122,"2,2,5,5-tetramethylhexane",137.46
123,"2,2,5-trimethylheptane",148.00
124,"2,5,5-trimethylheptane",152.80
125,"2,2,6-trimethylheptane",148.20


Use `py2opsin` to convert the IUPAC names into SMILES strings.

In [2]:
from py2opsin import py2opsin

# Hand the whole name column to OPSIN in one call and store what it returns
# as a new "smiles" column.
iupac_names = data["iupac_name"]
data["smiles"] = py2opsin(iupac_names)


In [3]:
from astartes.molecules import train_val_test_split_molecules

import numpy as np

# Keyword arguments for the fingerprint-based split: requested 80/10/10
# fractions, MACCS keys as the fingerprint, and the index arrays are
# requested in addition to the split arrays.
tts_args = {
    "train_size": 0.8,
    "test_size": 0.1,
    "val_size": 0.1,
    "return_indices": True,
    "fingerprint": "maccs_keys",
}

split_result = train_val_test_split_molecules(
    data["smiles"].to_numpy(),
    **tts_args,
)
# Six values are unpacked: the three split arrays, then the three index arrays.
# NOTE: `molecule_val` (singular) is the name later cells refer to.
molecules_train, molecule_val, molecules_test, idxs_train, idxs_val, idxs_test = split_result


  warn(


In [4]:
from sklearn.decomposition import PCA


# Index of every stacked fingerprint row, in stacking order: train, val, test.
# NOTE(review): this relies on astartes returning indices that are positions in
# the array it was given (here the rows of `data`) - confirm against its docs.
all_indxs = np.hstack((idxs_train, idxs_val, idxs_test))

# Scattering the projection back into data order (below) is only valid when the
# three splits together cover each row of `data` exactly once.
assert np.array_equal(np.sort(all_indxs), np.arange(len(data))), (
    "train/val/test indices must cover every row of `data` exactly once"
)

pca = PCA(n_components=3, random_state=42)
# PCA is unsupervised: scikit-learn ignores `y`, so no target is passed.
embedding_split_order = pca.fit_transform(
    np.vstack((molecules_train, molecule_val, molecules_test))
)

# The stacked rows are in split order, not in the row order of `data`.
# Undo that permutation so that embedding[i] describes data.iloc[i]; without
# this, indexing `embedding` and `data` with the same indices pairs each
# molecule with another molecule's coordinates.
embedding = np.empty_like(embedding_split_order)
embedding[all_indxs] = embedding_split_order
embedding_df = pd.DataFrame(embedding, columns=["PC1", "PC2", "PC3"])


Now that we have the data ready to go, let's plot it in 3D and see what it looks like.
To do so, we will use [`plotly`](https://plotly.com/python-api-reference/).
First let's write a nice wrapper function that will take in our data points and give back a nice 3D scatter plot that we can rotate and zoom in on.
If you're curious, go ahead and read through the code below - otherwise just remember that we have a function `scatter3d` that makes a nice looking 3D scatter plot.

In [5]:
import plotly.express as px
import molplotly
import warnings


def scatter3d(idxs_train, idxs_val, idxs_test, plot_port):
    """Show an interactive 3D scatter of the PCA embedding, colored by split.

    Reads the notebook-level ``data`` and ``embedding_df`` frames. The same
    indices are applied to both, so the two frames are expected to share
    row order.

    Args:
        idxs_train: integer row indices belonging to the training set.
        idxs_val: integer row indices belonging to the validation set.
        idxs_test: integer row indices belonging to the test set.
        plot_port: port on which the molplotly app is served.

    Returns:
        None. The app is started with ``mode="inline"``.
    """
    # Label every row with the split it belongs to.
    split_membership = np.empty(len(data), dtype=object)
    split_membership[idxs_val] = "val"
    split_membership[idxs_train] = "train"
    split_membership[idxs_test] = "test"

    # Plot order: all training rows, then validation rows, then test rows.
    all_indxs = np.hstack((idxs_train, idxs_val, idxs_test))

    split_df = pd.DataFrame.from_dict(
        {
            "PC1": embedding_df["PC1"].to_numpy()[all_indxs],
            "PC2": embedding_df["PC2"].to_numpy()[all_indxs],
            "PC3": embedding_df["PC3"].to_numpy()[all_indxs],
            "SMILES": data["smiles"].to_numpy()[all_indxs],
            "Boiling Point (°C)": data["boiling_point"].to_numpy()[all_indxs],
            # The labels must be reordered like every other column; otherwise
            # each point is colored with the split of an unrelated row.
            "split": split_membership[all_indxs],
            "iupac_name": data["iupac_name"].to_numpy()[all_indxs],
        }
    )
    fig = px.scatter_3d(
        data_frame=split_df,
        x="PC1",
        y="PC2",
        z="PC3",
        opacity=0.8,
        color="split",
        width=1000,
        height=1000,
    )
    fig.update_traces(
        marker=dict(size=4),
        selector=dict(mode="markers"),
        hovertemplate=None,  # molplotly breaks without this
    )
    # The same styling is applied to all three axes.
    axis_args = dict(
        backgroundcolor="rgba(0, 0, 0,0)",
        gridcolor="grey",
        showbackground=True,
        zerolinecolor="grey",
        showticklabels=True,
    )
    fig.update_layout(
        dict(
            plot_bgcolor="rgba(0, 0, 0, 0)",
            paper_bgcolor="rgba(0, 0, 0, 0)",
        ),
        scene=dict(
            xaxis=axis_args,
            yaxis=axis_args,
            zaxis=axis_args,
            xaxis_title="PC1",
            yaxis_title="PC2",
            zaxis_title="PC3",
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
        ),
    )
    # Fix one color per split so every figure uses the same scheme.
    consistent_colors = {"train": "grey", "val": "red", "test": "blue"}
    for trace in fig.data:
        trace.marker.color = consistent_colors[trace.name]
    with warnings.catch_warnings():
        # Silence FutureWarnings raised while molplotly builds the app.
        warnings.simplefilter("ignore", category=FutureWarning)
        app = molplotly.add_molecules(
            fig=fig,
            df=split_df,
            show_coords=False,
            color_col="split",
            caption_cols=["Boiling Point (°C)"],
            title_col="iupac_name",
        )
    app.run_server(mode="inline", port=plot_port, height=1000, width=1000)

In [6]:
# Visualize the split computed above; the app is served on port 8700.
scatter3d(idxs_train, idxs_val, idxs_test, plot_port=8700)


Dash is running on http://127.0.0.1:8700/



This embedding clusters the data into heavy alkanes with quaternary carbons, heavy alkanes with sub-quaternary carbons, light linear alkanes, and gases.

The dataset partitioning above was done with the entire fingerprint; this 3D projection is just to aid our understanding.

In [7]:
from astartes import train_val_test_split
# Requested split fractions for splits computed on the embedding; the index
# arrays are requested as well.
embedding_split_args = {
    "train_size": 0.8,
    "test_size": 0.1,
    "val_size": 0.1,
    "return_indices": True,
}

# Kennard-Stone sampling on the 3D embedding with a Euclidean metric.
kennard_stone_split = train_val_test_split(
    embedding,
    sampler="kennard_stone",
    hopts={"metric": "euclidean"},
    **embedding_split_args,
)
# Only the index arrays (the last three of the six returned values) are kept.
_, _, _, idxs_train, idxs_val, idxs_test = kennard_stone_split
scatter3d(idxs_train, idxs_val, idxs_test, 8701)

Dash is running on http://127.0.0.1:8701/




Actual train/test split differs from requested size. Requested validation size of 0.10, got 0.11. Requested test size of 0.10, got 0.11. 



In [8]:
# SPXY sampling: the embedding is passed together with the boiling points.
# NOTE(review): presumably both arrays must share row order for the pairing
# of features and targets to be meaningful - confirm.
spxy_split = train_val_test_split(
    embedding,
    data["boiling_point"].to_numpy(),
    sampler="spxy",
    hopts={"metric": "euclidean"},
    **embedding_split_args,
)
# Nine values come back; only the index arrays (the last three) are kept.
_, _, _, _, _, _, idxs_train, idxs_val, idxs_test = spxy_split
scatter3d(idxs_train, idxs_val, idxs_test, 8702)

Dash is running on http://127.0.0.1:8702/




Actual train/test split differs from requested size. Requested validation size of 0.10, got 0.11. Requested test size of 0.10, got 0.11. 



In [11]:
# Sphere-exclusion split on the 3D embedding. Nine values are unpacked here:
# three discarded arrays, three cluster arrays and three index arrays.
(
    _,
    _,
    _,
    clusters_train,  # was misspelled `clsuters_train`
    clusters_val,
    clusters_test,
    idxs_train,
    idxs_val,
    idxs_test,
) = train_val_test_split(
    embedding,
    sampler="sphere_exclusion",
    hopts=dict(
        metric="euclidean",
        distance_cutoff=0.1,
    ),
    **embedding_split_args,
)
scatter3d(idxs_train, idxs_val, idxs_test, 8703)


Actual train/test split differs from requested size. Requested train size of 0.80, got 0.82. Requested validation size of 0.10, got 0.09. Requested test size of 0.10, got 0.09. 



Dash is running on http://127.0.0.1:8703/

