# Better Data Splits for Machine Learning with `astartes`

Data: acyclic molecules and their boiling points (`trainset0`), available from
https://brunl01.users.greyc.fr/CHEMISTRY/

molplotly

In [1]:
import pandas as pd

# Read the table of IUPAC names and boiling points. The columns are separated
# by three spaces; a multi-character separator needs pandas' python engine.
with open("iupac_and_boiling.csv", "r") as csv_file:
    data = pd.read_csv(csv_file, engine="python", sep="   ")
data


Unnamed: 0,iupac_name,boiling_point
0,ethylmethylether,10.80
1,diethylether,34.60
2,isopropylmethylether,32.00
3,butylmethylether,70.30
4,ethylpropylether,63.60
...,...,...
110,"2,2,5,5-tetramethylhexane",137.46
111,"2,2,5-trimethylheptane",148.00
112,"2,5,5-trimethylheptane",152.80
113,"2,2,6-trimethylheptane",148.20


Use `py2opsin` to convert the IUPAC names into SMILES strings.

In [2]:
from py2opsin import py2opsin

# Convert the whole "iupac_name" column in one py2opsin call and store the
# result as a new "smiles" column next to the names.
# NOTE(review): this adds a column to `data` in place, so the table displayed
# by the loading cell no longer shows every column of `data`.
# NOTE(review): presumably one SMILES string is returned per name, in the same
# order - confirm how py2opsin reports names it cannot translate.
data["smiles"] = py2opsin(data["iupac_name"])


In [3]:
from astartes.molecules import train_val_test_split
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem import MolFromSmiles
import numpy as np

# Use the exact molecular weight of each molecule as the single input feature.
weights = []
for row, smi in enumerate(data["smiles"]):
    mol = MolFromSmiles(smi)
    # MolFromSmiles returns None for a SMILES it cannot parse and an atom-free
    # molecule for an empty string. Stop with a clear message instead of
    # crashing inside ExactMolWt or silently recording a weight of 0.
    if mol is None or mol.GetNumAtoms() == 0:
        raise ValueError(
            f"Could not build a molecule from SMILES {smi!r} (row {row})"
        )
    weights.append(ExactMolWt(mol))
weights = np.array(weights)

# Arguments shared by every split in this notebook: the regression target and
# the requested 80/10/10 train/validation/test proportions.
tts_args = dict(
    y=data["boiling_point"].to_numpy(),
    train_size=0.8,
    test_size=0.1,
    val_size=0.1,
)

# Split with the default sampler (no `sampler` argument is given).
# NOTE(review): no seed is passed here; if the default sampler is random, the
# MAE values printed further down will change from run to run - consider
# fixing a seed (confirm the parameter name for the installed astartes).
(
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
) = train_val_test_split(
    weights,
    **tts_args,
)


In [4]:
import plotly.graph_objects as go
import plotly.express as px
from scipy.optimize import curve_fit
from sklearn.metrics import mean_absolute_error

def model(x, a, b):
    """Exponential model used for the regression: ``a`` raised to ``x + b``.

    Args:
        x: Input value(s) at which the model is evaluated.
        a: Base of the exponential.
        b: Offset added to ``x`` before exponentiation.

    Returns:
        ``a ** (x + b)``.
    """
    exponent = x + b
    return a**exponent


def scatter(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit ``model`` on the training split, print the errors and plot every split.

    The parameters of ``model`` are fitted to the training data with
    ``curve_fit``. The mean absolute error of the fitted curve on the
    validation and testing splits is printed, and a figure is shown with the
    training, validation and testing points plus the fitted curve evaluated on
    the fixed range 40-159.

    Args:
        X_train, X_val, X_test: Feature values of each split (plotted on the
            x axis, labelled "Molecular Weight").
        y_train, y_val, y_test: Target values of each split (plotted on the
            y axis, labelled "Boiling Point (°C)").

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = px.scatter(
        x=X_train,
        y=y_train,
    )
    # px.scatter leaves a lone trace unnamed and hidden from the legend; label
    # it now (before any other trace exists) so the training points can be
    # told apart from the validation and testing points.
    fig.update_traces(name="training", showlegend=True)

    params, _covariance = curve_fit(model, X_train, y_train)
    print("Validation MAE:", mean_absolute_error(y_val, model(X_val, *params)))
    print("Testing MAE:", mean_absolute_error(y_test, model(X_test, *params)))

    # Evaluate the fitted curve in one vectorised call, the same way the model
    # is applied to X_val / X_test above.
    regression_xdata = np.arange(40, 160, 1)
    regression_ydata = model(regression_xdata, *params)

    # (legend name, x values, y values, drawing mode) for each additional trace.
    extra_traces = (
        ("model prediction", regression_xdata, regression_ydata, "lines"),
        ("validation", X_val, y_val, "markers"),
        ("testing", X_test, y_test, "markers"),
    )
    for trace_name, xs, ys, mode in extra_traces:
        fig.add_trace(go.Scatter(x=xs, y=ys, mode=mode, name=trace_name))

    fig.update_layout(
        # Transparent backgrounds and a fixed square canvas.
        plot_bgcolor="rgba(0, 0, 0, 0)",
        paper_bgcolor="rgba(0, 0, 0, 0)",
        height=800,
        width=800,
        # Horizontal legend centred just above the plotting area.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
        ),
    )
    # Identical line and grid styling for both axes.
    ax_args = dict(
        showline=True,
        linewidth=2,
        linecolor="grey",
        gridwidth=1,
        showgrid=True,
        gridcolor="grey",
        zerolinecolor="grey",
        zerolinewidth=1,
    )
    fig.update_xaxes(title="Molecular Weight", **ax_args)
    fig.update_yaxes(title="Boiling Point (°C)", **ax_args)
    fig.show()

In [5]:
# Fit and plot using the split that was made without an explicit sampler.
scatter(X_train, X_val, X_test, y_train, y_val, y_test)

Validation MAE: 10.352598175092146
Testing MAE: 9.478193874530506


In [6]:
# Repeat the split with the Kennard-Stone sampler and a Euclidean metric.
# The weights are passed as a single-column 2-D array (presumably what the
# distance-based sampler expects - confirm) and flattened again for the
# fit and the plot.
ks_X_train, ks_X_val, ks_X_test, ks_y_train, ks_y_val, ks_y_test = train_val_test_split(
    weights.reshape(-1, 1),
    sampler="kennard_stone",
    hopts={"metric": "euclidean"},
    **tts_args,
)
scatter(
    ks_X_train.flatten(),
    ks_X_val.flatten(),
    ks_X_test.flatten(),
    ks_y_train,
    ks_y_val,
    ks_y_test,
)

Validation MAE: 9.166684669711474
Testing MAE: 6.091710058627203


In [7]:
# Repeat the split with the sphere-exclusion sampler (Euclidean metric,
# distance cutoff 0.1). This call is unpacked into nine values rather than six.
# The three extra values get real names instead of `_`, because assigning to
# `_` in a notebook rebinds IPython's last-output variable.
# NOTE(review): the extra values are presumably the cluster labels of each
# split - confirm against the astartes documentation.
(
    spex_X_train,
    spex_X_val,
    spex_X_test,
    spex_y_train,
    spex_y_val,
    spex_y_test,
    spex_clusters_train,
    spex_clusters_val,
    spex_clusters_test,
) = train_val_test_split(
    weights.reshape(-1, 1),
    sampler="sphere_exclusion",
    hopts=dict(
        metric="euclidean",
        distance_cutoff=0.1,
    ),
    **tts_args,
)
scatter(
    spex_X_train.flatten(),
    spex_X_val.flatten(),
    spex_X_test.flatten(),
    spex_y_train,
    spex_y_val,
    spex_y_test,
)

Validation MAE: 18.258527918375204
Testing MAE: 24.15192257309957



Actual train/test split differs from requested size. Requested train size of 0.80, got 0.86. Requested validation size of 0.10, got 0.09. Requested test size of 0.10, got 0.09. 

