In [89]:
import pandas as pd
from pathlib import Path
from loguru import logger

In [90]:
# --- Run configuration -------------------------------------------------------
# task_name:    absorption | emission | quantum_yield | log_molar_absorptivity
# dataset_name: consolidation | cyanine | xanthene
# split_method: random | scaffold
task_name = "absorption"
dataset_name = "xanthene"
split_method = "random"

# Processed datasets live under `data_dir`; the raw per-fold CSVs sit in its
# `raw` subdirectory.
data_dir = Path("../datasets/")
raw_data_dir = data_dir / "raw"

In [91]:
def build_data(raw_data_path, data_path, task_name, dataset_name=None):
    """Extract one task's columns from a raw CSV and write them to ``data_path``.

    Rows whose ``task_name`` value is missing are dropped. The output keeps
    only the ``smiles``, ``solvent``, ``task_name`` and ``split`` columns and
    is written without the index.

    Parameters
    ----------
    raw_data_path : str or Path
        CSV file to read.
    data_path : str or Path
        Destination CSV file.
    task_name : str
        Name of the target column to keep and to filter missing values on.
    dataset_name : str, optional
        Label used only in the summary log line. Defaults to the stem of
        ``raw_data_path`` so the function does not depend on a notebook-level
        global being defined.

    Raises
    ------
    KeyError
        If any of the required columns is absent from the raw CSV.
    """
    if dataset_name is None:
        dataset_name = Path(raw_data_path).stem

    df = pd.read_csv(raw_data_path)
    logger.info(f"before dropna: {len(df)}")

    required = ["smiles", "solvent", task_name, "split"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{raw_data_path} is missing required column(s): {missing}")

    # Build a new frame rather than mutating the one that was just read.
    df_new = df.dropna(subset=[task_name])[required]
    df_new.to_csv(data_path, index=False)

    # One pass over the split column instead of three boolean filters.
    split_counts = df_new["split"].value_counts()
    n_total = len(df_new)
    n_test = int(split_counts.get("test", 0))
    n_valid = int(split_counts.get("valid", 0))
    n_train = int(split_counts.get("train", 0))
    logger.info(f"Dataset {dataset_name} with {n_total} samples, {n_test} test, {n_valid} valid, {n_train} train")

In [92]:
# Number of folds to process; one raw CSV named `<dataset_name>_fold<k>.csv`
# is read for each fold index k.
n_folds = 5

for fold in range(n_folds):
    logger.info(f"Building {task_name} for {dataset_name} fold {fold}")

    raw_data_path = raw_data_dir / split_method / f"{dataset_name}_fold{fold}.csv"
    # Check the input before creating anything on disk, so a typo in the
    # configuration does not leave empty output directories behind.
    if not raw_data_path.is_file():
        raise FileNotFoundError(f"Raw data file not found: {raw_data_path}")

    task_dir = data_dir / split_method / f"{dataset_name}_fold{fold}" / task_name
    task_dir.mkdir(parents=True, exist_ok=True)

    data_path = task_dir / f"{task_name}.csv"
    build_data(raw_data_path, data_path, task_name)

[32m2025-04-05 18:54:21.007[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mBuilding absorption for xanthene fold 0[0m
[32m2025-04-05 18:54:21.024[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_data[0m:[36m3[0m - [1mbefore dropna: 1376[0m


[32m2025-04-05 18:54:21.045[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_data[0m:[36m17[0m - [1mDataset xanthene with 1370 samples, 229 test, 229 valid, 912 train[0m
[32m2025-04-05 18:54:21.048[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mBuilding absorption for xanthene fold 1[0m
[32m2025-04-05 18:54:21.061[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_data[0m:[36m3[0m - [1mbefore dropna: 1375[0m
[32m2025-04-05 18:54:21.080[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_data[0m:[36m17[0m - [1mDataset xanthene with 1370 samples, 229 test, 229 valid, 912 train[0m
[32m2025-04-05 18:54:21.082[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mBuilding absorption for xanthene fold 2[0m
[32m2025-04-05 18:54:21.094[0m | [1mINFO    [0m | [36m__main__[0m:[36mbuild_data[0m:[36m3[0m - [1mbefore dropna: 1375[0m
[32m2025-04-05 18:54:21.109[0m | [1mINFO    [0m | [36m__main__[0