# Better Data Splits for Machine Learning with `astartes`

We use a downsampled version of the Alkane Boiling Point dataset found at this URL: https://brunl01.users.greyc.fr/CHEMISTRY/

Some of the constitutional isomers have been removed for simplicity in plotting.

Start by loading the data using pandas.

In [1]:
import pandas as pd

# Load the downsampled alkane boiling-point table into a DataFrame.
# NOTE(review): `sep` is a multi-character whitespace string; per the pandas
# docs such separators are interpreted as regular expressions and require the
# Python parsing engine, hence engine="python" -- confirm against the file.
with open("iupac_and_boiling.csv", "r") as f:
    data = pd.read_csv(f, sep="   ", engine="python")
data  # bare expression so the notebook renders the loaded table


Unnamed: 0,idx,filename,boiling_point,iupac_name
0,1,molecule001.ct,-164.0,methane
1,2,molecule002.ct,-88.6,ethane
2,3,molecule003.ct,-42.1,propane
3,4,molecule004.ct,-11.7,2-methylpropane
4,5,molecule005.ct,-0.5,butane
5,6,molecule006.ct,9.5,"2,2-dimethylpropane"
6,7,molecule007.ct,27.8,2-methylbutane
7,8,molecule008.ct,36.1,pentane
8,9,molecule009.ct,49.7,"2,2-dimethylbutane"
9,10,molecule010.ct,58.0,"2,3-dimethylbutane"


Use [py2opsin](https://github.com/JacksonBurns/py2opsin) to convert the IUPAC names into SMILES strings:

In [2]:
from py2opsin import py2opsin

# Translate the IUPAC names into SMILES strings and store them as a new column.
# NOTE(review): presumably py2opsin returns one entry per input name, in the
# same order, so the result aligns with the DataFrame rows -- confirm.
data["smiles"] = py2opsin(data["iupac_name"])


Use RDKit to get the molecular weight as our input feature.

In [3]:
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem import MolFromSmiles
import numpy as np

# Exact molecular weight of each molecule, computed from its SMILES string and
# gathered into a 1-D NumPy array that serves as the single input feature.
weights = np.array(
    [ExactMolWt(MolFromSmiles(smiles)) for smiles in data["smiles"]]
)

Now we use `train_val_test_split` to partition this data, first with a random sampler and then with the SPXY and Sphere Exclusion algorithms.

In [4]:
from astartes.molecules import train_val_test_split


# Keyword arguments shared by every split in this notebook: the regression
# targets plus the requested 80/10/10 train/validation/test fractions.
tts_args = {
    "y": data["boiling_point"].to_numpy(),
    "train_size": 0.8,
    "test_size": 0.1,
    "val_size": 0.1,
}

# No sampler is named here, so the library's default sampler is used.
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    weights,
    **tts_args,
)


  warn(


To plot the results of each sampling approach, we define `scatter`.

In [5]:
import plotly.graph_objects as go
import plotly.express as px
from scipy.optimize import curve_fit
from sklearn.metrics import mean_absolute_error, r2_score


def model(x, m, b):
    """Evaluate the straight line ``m * x + b``.

    Args:
        x: Input value(s); anything supporting ``*`` and ``+`` with ``m``
            and ``b`` (scalars and NumPy arrays are both used in this file).
        m: Slope of the line.
        b: Intercept of the line.

    Returns:
        The value(s) of the line at ``x``.
    """
    scaled = m * x
    return scaled + b


def scatter(X_train, X_val, X_test, y_train, y_val, y_test):
    """Plot one train/validation/test split with a linear fit to the training set.

    Fits ``model`` (a straight line) to the training points with
    ``scipy.optimize.curve_fit``, draws the fitted line together with the
    three subsets, and annotates the figure with the mean absolute error of
    each subset, the fitted equation, and the training R^2.

    Args:
        X_train, X_val, X_test: Input feature values for each subset
            (plotted on the "Molecular Weight" axis). Callers in this file
            pass 1-D arrays.
        y_train, y_val, y_test: Matching target values for each subset
            (plotted on the "Boiling Point (°C)" axis).

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Training points form the base figure, drawn small and grey so that the
    # validation/testing markers added later stand out.
    fig = px.scatter(
        x=X_train,
        y=y_train,
    )
    fig.update_traces(
        marker=dict(
            size=8,
            color="grey",
            opacity=0.6,
        )
    )

    # Least-squares fit of the line to the training data only.
    params, _ = curve_fit(model, X_train, y_train)
    train_pred = model(X_train, *params)

    # Fixed x-range for drawing the fitted line.
    # NOTE(review): 16-144 presumably spans the molecular weights in this
    # dataset -- confirm if the data changes.
    regression_xdata = np.arange(16, 145, 1)
    # `model` is plain arithmetic, so it can be evaluated on the whole array.
    regression_y_data = model(regression_xdata, *params)
    fig.add_trace(
        go.Scatter(
            x=regression_xdata,
            y=regression_y_data,
            mode="lines",
            name="model prediction",
            line=dict(
                color="black",
            )
        )
    )
    fig.add_trace(
        go.Scatter(
            x=X_val,
            y=y_val,
            mode="markers",
            marker=dict(
                size=12,
                color="red",
            ),
            name="validation",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=X_test,
            y=y_test,
            mode="markers",
            marker=dict(
                size=12,
                color="blue",
            ),
            name="testing",
        )
    )

    # Transparent square canvas with a horizontal legend centred above the plot.
    fig.update_layout(
        dict(
            plot_bgcolor="rgba(0, 0, 0, 0)",
            paper_bgcolor="rgba(0, 0, 0, 0)",
            height=600,
            width=600,
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
        ),
    )
    # Axis styling shared by the x and y axes.
    ax_args = dict(
        showline=True,
        linewidth=2,
        linecolor="grey",
        gridwidth=1,
        showgrid=True,
        gridcolor="rgba(0, 0, 0, 0.15)",
        zerolinecolor="grey",
        zerolinewidth=1,
        griddash='dash',
        layer='below traces',
    )
    fig.update_xaxes(title="Molecular Weight", **ax_args)
    fig.update_yaxes(title="Boiling Point (°C)", **ax_args)

    # Error summary for the three subsets, all scored with the training fit.
    fig.add_annotation(
        x=50,
        y=150,
        text="Training MAE: {:.2f}<br>Validation MAE: {:.2f}<br>Testing MAE: {:.2f}<br>".format(
            mean_absolute_error(y_train, train_pred),
            mean_absolute_error(y_val, model(X_val, *params)),
            mean_absolute_error(y_test, model(X_test, *params)),
        ),
        showarrow=False,
        font=dict(
            size=16,
        ),
        bgcolor="rgba(0, 0, 0, 0.1)",
        bordercolor="rgba(0, 0, 0, 1)",
        borderwidth=1,
    )
    # Fitted equation and training R^2. The intercept uses an explicit sign
    # ("+" or "-") so the equation stays readable when the intercept is
    # non-negative; previously a positive intercept was glued straight onto
    # "MW" with no operator.
    fig.add_annotation(
        x=100,
        y=-100,
        text="Model:<br>BP={:.2f}*MW{:+.2f}<br>R<sup>2</sup>={:.2f}<br>".format(
            params[0],
            params[1],
            r2_score(y_train, train_pred),
        ),
        showarrow=False,
        font=dict(
            size=16,
        ),
        bgcolor="rgba(0, 0, 0, 0.1)",
        bordercolor="rgba(0, 0, 0, 1)",
    )
    fig.show()

In [6]:
# Plot the first split (the one made without naming a sampler).
scatter(X_train, X_val, X_test, y_train, y_val, y_test)

In [7]:
# Repeat the split with the SPXY sampler. The feature is reshaped into a
# single-column 2-D array for the sampler and flattened again for plotting.
ks_X_train, ks_X_val, ks_X_test, ks_y_train, ks_y_val, ks_y_test = train_val_test_split(
    weights.reshape(-1, 1),
    sampler="spxy",
    hopts={"metric": "euclidean"},
    **tts_args,
)
scatter(
    *(features.flatten() for features in (ks_X_train, ks_X_val, ks_X_test)),
    ks_y_train,
    ks_y_val,
    ks_y_test,
)



Actual train/test split differs from requested size. Requested train size of 0.80, got 0.79. Requested validation size of 0.10, got 0.12. Requested test size of 0.10, got 0.12. 



In [8]:
# Repeat the split with the sphere-exclusion sampler. This call yields nine
# values; the last three are not needed for plotting and are discarded.
(
    spex_X_train, spex_X_val, spex_X_test,
    spex_y_train, spex_y_val, spex_y_test,
    _, _, _,
) = train_val_test_split(
    weights.reshape(-1, 1),
    sampler="sphere_exclusion",
    hopts={"metric": "euclidean", "distance_cutoff": 0.1},
    **tts_args,
)
scatter(
    *(features.flatten() for features in (spex_X_train, spex_X_val, spex_X_test)),
    spex_y_train,
    spex_y_val,
    spex_y_test,
)



Actual train/test split differs from requested size. Requested train size of 0.80, got 0.85. Requested validation size of 0.10, got 0.09. Requested test size of 0.10, got 0.09. 

