In [21]:
import pandas as pd
from pathlib import Path
from molvs import standardize_smiles
from loguru import logger
from tqdm.auto import tqdm
from typing import Optional

In [22]:
def canonicalize_smiles(smiles: str) -> Optional[str]:
    """Return the standardized form of a SMILES string, or ``None`` on failure.

    Args:
        smiles: SMILES string to standardize. Values that are not ``str``
            (for example ``None`` or the float NaN pandas uses for empty
            CSV fields) are treated as missing.

    Returns:
        The value returned by ``standardize_smiles`` for ``smiles``, or
        ``None`` when the input is missing or standardization raises.
    """
    # A missing value is not a canonicalization failure: report it as missing
    # without invoking the standardizer or logging an error for it.
    if not isinstance(smiles, str):
        return None
    try:
        return standardize_smiles(smiles)
    except Exception as e:
        # Deliberately best-effort: a single unparsable molecule must not abort
        # a whole-column apply. The failure is logged and surfaced as None.
        logger.error(f"Error canonicalizing smiles: {smiles} with error: {e}")
        return None

In [23]:
# Directory containing the raw CSV files, relative to the notebook's working directory.
raw_data_dir = Path("../datasets/raw")

# Read each of the three raw tables into its own DataFrame.
data_consolidation = pd.read_csv(raw_data_dir.joinpath("Dataset_Consolidation.csv"))
data_cyanine = pd.read_csv(raw_data_dir.joinpath("Dataset_Cyanine.csv"))
data_xanthene = pd.read_csv(raw_data_dir.joinpath("Dataset_Xanthene.csv"))

In [24]:
def canonicalize_df(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Canonicalize one SMILES column and drop rows that could not be canonicalized.

    The input frame is left untouched; a new frame is returned. This keeps the
    calling cell safe to re-run and avoids assigning into a frame that was
    itself produced by a previous ``dropna``.

    Args:
        df: Frame containing ``column``.
        column: Name of the column whose values are passed through
            ``canonicalize_smiles``.

    Returns:
        A new DataFrame in which ``column`` holds the canonicalized values and
        every row where canonicalization yielded ``None`` has been removed.
        The original index labels of the surviving rows are preserved.
    """
    # Registers ``progress_apply`` on pandas objects with a per-column label.
    tqdm.pandas(desc=f"canonicalizing {column}")
    logger.info(f"before canonicalizing # {column} in {df.shape[0]} rows")
    canonical = df[column].progress_apply(canonicalize_smiles)
    # ``assign`` builds a new frame instead of writing into the caller's object.
    result = df.assign(**{column: canonical}).dropna(subset=[column])
    logger.info(f"after canonicalizing # {column} in {result.shape[0]} rows")
    return result

In [25]:
# Canonicalize the "smiles" column first, then the "solvent" column of the result.
data_consolidation = canonicalize_df(
    canonicalize_df(data_consolidation, "smiles"),
    "solvent",
)

[32m2025-04-03 17:34:38.067[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m3[0m - [1mbefore canonicalizing # smiles in 36756 rows[0m


canonicalizing smiles:   0%|          | 0/36756 [00:00<?, ?it/s]

[17:34:38] Can't kekulize mol.  Unkekulized atoms: 3 8
[17:34:38] Can't kekulize mol.  Unkekulized atoms: 3 6
[17:34:38] Can't kekulize mol.  Unkekulized atoms: 3 6
[17:34:38] Can't kekulize mol.  Unkekulized atoms: 3 8
[17:34:38] Can't kekulize mol.  Unkekulized atoms: 3 6
[17:34:40] Can't kekulize mol.  Unkekulized atoms: 3 11 12 15 16 19 20 21
[17:34:42] Explicit valence for atom # 37 Al, 2, is greater than permitted
[32m2025-04-03 17:34:42.072[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mcanonicalize_smiles[0m:[36m5[0m - [31m[1mError canonicalizing smiles: C=CCCCOc1cccc(C2=C3C=CC(=N3)C(c3cc(C(C)(C)C)cc(C(C)(C)C)c3)=c3ccc4n3[Al+3]n3c(ccc32)C(c2cc(C(C)(C)C)cc(C(C)(C)C)c2)=C2C=CC(=N2)C=4c2cccc(OCCCC=C)c2)c1 with error: Explicit valence for atom # 37 Al, 2, is greater than permitted[0m
[17:34:42] Explicit valence for atom # 15 Al, 2, is greater than permitted
[32m2025-04-03 17:34:42.300[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mcanonicalize_smiles[0m:[36m5

canonicalizing solvent:   0%|          | 0/36750 [00:00<?, ?it/s]

[32m2025-04-03 17:35:37.800[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m6[0m - [1mafter canonicalizing # solvent in 36750 rows[0m


In [26]:
# Canonicalize the "smiles" column first, then the "solvent" column of the result.
data_cyanine = canonicalize_df(
    canonicalize_df(data_cyanine, "smiles"),
    "solvent",
)

[32m2025-04-03 17:35:37.845[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m3[0m - [1mbefore canonicalizing # smiles in 1496 rows[0m


canonicalizing smiles:   0%|          | 0/1496 [00:00<?, ?it/s]

[32m2025-04-03 17:35:39.860[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m6[0m - [1mafter canonicalizing # smiles in 1496 rows[0m
[32m2025-04-03 17:35:39.862[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m3[0m - [1mbefore canonicalizing # solvent in 1496 rows[0m


canonicalizing solvent:   0%|          | 0/1496 [00:00<?, ?it/s]

[32m2025-04-03 17:35:40.280[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m6[0m - [1mafter canonicalizing # solvent in 1496 rows[0m


In [27]:
# Canonicalize the "smiles" column first, then the "solvent" column of the result.
data_xanthene = canonicalize_df(
    canonicalize_df(data_xanthene, "smiles"),
    "solvent",
)

[32m2025-04-03 17:35:40.341[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m3[0m - [1mbefore canonicalizing # smiles in 1152 rows[0m


canonicalizing smiles:   0%|          | 0/1152 [00:00<?, ?it/s]

[32m2025-04-03 17:35:41.745[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m6[0m - [1mafter canonicalizing # smiles in 1152 rows[0m
[32m2025-04-03 17:35:41.748[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m3[0m - [1mbefore canonicalizing # solvent in 1152 rows[0m


canonicalizing solvent:   0%|          | 0/1152 [00:00<?, ?it/s]

[32m2025-04-03 17:35:42.062[0m | [1mINFO    [0m | [36m__main__[0m:[36mcanonicalize_df[0m:[36m6[0m - [1mafter canonicalizing # solvent in 1152 rows[0m


In [28]:
# Write each canonicalized table next to its raw source file, without the index column.
data_consolidation.to_csv(
    path_or_buf=raw_data_dir.joinpath("Dataset_Consolidation_canonicalized.csv"),
    index=False,
)
data_cyanine.to_csv(
    path_or_buf=raw_data_dir.joinpath("Dataset_Cyanine_canonicalized.csv"),
    index=False,
)
data_xanthene.to_csv(
    path_or_buf=raw_data_dir.joinpath("Dataset_Xanthene_canonicalized.csv"),
    index=False,
)