In [None]:
from pathlib import Path

from autogluon.tabular import TabularDataset, TabularPredictor
from mordred import Calculator, descriptors
import pandas as pd
from rdkit.Chem import MolFromSmiles

In [2]:
# Directory that holds the input CSV files and the cached parquet tables used by
# the cells below; created on first run, left alone if it already exists.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

In [3]:
# Mordred descriptor calculator shared by the train and test cells.
# ignore_3D=True: the molecules are parsed from SMILES strings in the cells below,
# so 3D descriptors are left out.
calc = Calculator(descriptors, ignore_3D=True)

In [None]:
# Build (or reload) the training table: the "ExperimentalLogS" target followed by
# the Mordred descriptor columns. The table is cached as parquet so the descriptor
# calculation only runs when the cache file is missing.
train_file = data_dir / "train.parquet"
if train_file.exists():
    train_df = pd.read_parquet(train_file)
else:
    # Named train_raw (not train_data) so it is not confused with the
    # TabularDataset that a later cell binds to train_data.
    train_raw = pd.read_csv(data_dir / "AqSolDBc.csv")
    train_raw["rdkit_mol"] = train_raw["SmilesCurated"].apply(MolFromSmiles)
    # MolFromSmiles yields None for SMILES it cannot parse. Drop those rows and
    # renumber them: pd.concat(axis=1) aligns on index labels, so a gapped index
    # on one side would pair targets with the wrong descriptor rows (or add
    # all-NaN rows) if the descriptor frame does not carry the same labels.
    train_raw = train_raw.dropna(axis=0, subset=["rdkit_mol"]).reset_index(drop=True)
    train_descs: pd.DataFrame = calc.pandas(train_raw["rdkit_mol"]).fill_missing()
    train_df = pd.concat((train_raw[["ExperimentalLogS"]], train_descs), axis=1)
    train_df.to_parquet(train_file)

In [None]:
# Build (or reload) the evaluation table: the "LogS" target followed by the
# Mordred descriptor columns. The table is cached as parquet so the descriptor
# calculation only runs when the cache file is missing.
test_file = data_dir / "test.parquet"
if test_file.exists():
    test_df = pd.read_parquet(test_file)
else:
    # Named test_raw (not test_data) so it is not confused with the
    # TabularDataset that a later cell binds to test_data.
    test_raw = pd.read_csv(data_dir / "OChemUnseen.csv")
    test_raw["rdkit_mol"] = test_raw["SMILES"].apply(MolFromSmiles)
    # MolFromSmiles yields None for SMILES it cannot parse. Drop those rows and
    # renumber them: pd.concat(axis=1) aligns on index labels, so a gapped index
    # on one side would pair targets with the wrong descriptor rows (or add
    # all-NaN rows) if the descriptor frame does not carry the same labels.
    test_raw = test_raw.dropna(axis=0, subset=["rdkit_mol"]).reset_index(drop=True)
    test_descs: pd.DataFrame = calc.pandas(test_raw["rdkit_mol"]).fill_missing()
    test_df = pd.concat((test_raw[["LogS"]], test_descs), axis=1)
    test_df.to_parquet(test_file)

In [21]:
# Wrap both tables for AutoGluon.
# The test table stores its target as "LogS", while the predictor is trained on
# "ExperimentalLogS". Rename it here so evaluation can find the label column.
# rename() ignores a missing key, so this is a no-op if the column already has
# the training name.
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df.rename(columns={"LogS": "ExperimentalLogS"}))

In [None]:
# Train an AutoGluon tabular model that predicts the "ExperimentalLogS" column
# from every other column of train_data.
predictor = TabularPredictor(
    label="ExperimentalLogS",
    log_to_file=True,  # also keep the training log in a file
).fit(
    train_data,
    num_gpus=1,  # request a single GPU for training
)

In [None]:
# Score the fitted predictor on the test table. evaluate() compares predictions
# with the predictor's label column, so test_data must contain "ExperimentalLogS".
test_scores = predictor.evaluate(test_data)
test_scores