In [17]:
from rdkit import Chem

def check_smiles_validity(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        if mol is None:
            print(f"Invalid SMILES (MolFromSmiles returned None): {smiles}")
            return False
        return True
    except Exception as e:
        print(f"Invalid SMILES (exception raised): {smiles} | Error: {e}")
        return False

with open('/home/akshatz/bond_order_free/multi_molecule/dataset/mult_mol_data.csv') as f:
    next(f)  # skip header
    for line in f:
        smi = line.strip().split(',')[0] 
        if not check_smiles_validity(smi):
            print(f" --> {smi} is invalid")


[14:39:25] Explicit valence for atom # 4 C, 5, is greater than permitted


Invalid SMILES (MolFromSmiles returned None): CN1C=C[CH2+]=CC1
 --> CN1C=C[CH2+]=CC1 is invalid




In [18]:
import csv
import json
from rdkit import Chem
from pathlib import Path

def check_smiles_validity(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        return mol is not None
    except Exception:
        return False

def process_dataset_and_splits(data_path, splits_path, output_data_path=None, invalid_smiles_path=None, updated_splits_path=None):
    # Set default output paths
    data_path = Path(data_path)
    splits_path = Path(splits_path)
    
    if output_data_path is None:
        output_data_path = data_path.with_name(data_path.stem + '_filtered.csv')
    if invalid_smiles_path is None:
        invalid_smiles_path = data_path.with_name(data_path.stem + '_invalid.csv')
    if updated_splits_path is None:
        updated_splits_path = splits_path.with_name(splits_path.stem + '_filtered.json')

    # Step 1: Read and filter valid rows
    valid_rows = []
    invalid_rows = []
    with open(data_path, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for idx, row in enumerate(reader):
            smi = row[0]
            if check_smiles_validity(smi):
                valid_rows.append((idx, row))
            else:
                invalid_rows.append((idx, row))

    # Step 2: Write filtered valid data
    with open(output_data_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for _, row in valid_rows:
            writer.writerow(row)

    # Step 3: Write invalid rows
    with open(invalid_smiles_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for _, row in invalid_rows:
            writer.writerow(row)

    # Step 4: Adjust splits.json
    old_to_new_index = {}
    for new_idx, (old_idx, _) in enumerate(valid_rows):
        old_to_new_index[old_idx] = new_idx

    with open(splits_path, 'r') as f:
        old_splits = json.load(f)

    new_splits = []
    for split_dict in old_splits:
        new_split_dict = {}
        for key in ['train', 'val', 'test']:
            start, end = map(int, split_dict[key].split('-'))
            # Keep only valid indices within this range
            new_indices = [
                old_to_new_index[i]
                for i in range(start, end + 1)
                if i in old_to_new_index
            ]
            if new_indices:
                new_split_dict[key] = f"{min(new_indices)}-{max(new_indices)}"
            else:
                new_split_dict[key] = "0--1"  # Placeholder if empty
        new_splits.append(new_split_dict)

    with open(updated_splits_path, 'w') as f:
        json.dump(new_splits, f, indent=4)

    print(f"Filtered data saved to: {output_data_path}")
    print(f"Invalid SMILES saved to: {invalid_smiles_path}")
    print(f"Updated splits saved to: {updated_splits_path}")


In [19]:
# mult_mol uv/vis
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/multi_molecule/dataset/mult_mol_data.csv",
    splits_path="/home/akshatz/bond_order_free/multi_molecule/dataset/splits.json"
)

[14:39:40] Explicit valence for atom # 4 C, 5, is greater than permitted


Filtered data saved to: /home/akshatz/bond_order_free/multi_molecule/dataset/mult_mol_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/multi_molecule/dataset/mult_mol_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/multi_molecule/dataset/splits_filtered.json




In [20]:
#hiv
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/hiv/dataset/hiv_data.csv",
    splits_path="/home/akshatz/bond_order_free/hiv/dataset/splits.json"
)

[14:40:08] Explicit valence for atom # 5 B, 5, is greater than permitted
[14:40:09] Explicit valence for atom # 3 Al, 6, is greater than permitted
[14:40:12] Explicit valence for atom # 6 Ge, 5, is greater than permitted
[14:40:12] Explicit valence for atom # 13 Al, 7, is greater than permitted
[14:40:12] Explicit valence for atom # 12 Al, 7, is greater than permitted
[14:40:12] Explicit valence for atom # 4 Al, 9, is greater than permitted


Filtered data saved to: /home/akshatz/bond_order_free/hiv/dataset/hiv_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/hiv/dataset/hiv_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/hiv/dataset/splits_filtered.json


[14:40:13] Explicit valence for atom # 16 Al, 9, is greater than permitted


In [21]:
#qm9
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/qm9/dataset/qm9_data.csv",
    splits_path="/home/akshatz/bond_order_free/qm9/dataset/splits.json"
)

Filtered data saved to: /home/akshatz/bond_order_free/qm9/dataset/qm9_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/qm9/dataset/qm9_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/qm9/dataset/splits_filtered.json


In [22]:
#pcba random
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/pcba_random/dataset/pcba_random_data.csv",
    splits_path="/home/akshatz/bond_order_free/pcba_random/dataset/splits.json"
)

[14:40:40] Explicit valence for atom # 5 Al, 6, is greater than permitted
[14:40:56] Explicit valence for atom # 4 Al, 5, is greater than permitted


Filtered data saved to: /home/akshatz/bond_order_free/pcba_random/dataset/pcba_random_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/pcba_random/dataset/pcba_random_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/pcba_random/dataset/splits_filtered.json


In [23]:
#pcba random nan
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/pcba_random_nan/dataset/pcba_random_nan_data.csv",
    splits_path="/home/akshatz/bond_order_free/pcba_random_nan/dataset/splits.json"
)

[14:41:42] Explicit valence for atom # 4 Al, 5, is greater than permitted
[14:42:07] Explicit valence for atom # 5 Al, 6, is greater than permitted


Filtered data saved to: /home/akshatz/bond_order_free/pcba_random_nan/dataset/pcba_random_nan_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/pcba_random_nan/dataset/pcba_random_nan_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/pcba_random_nan/dataset/splits_filtered.json


In [24]:
#pcba scaffold
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/pcba_scaffold/dataset/pcba_scaffold_data.csv",
    splits_path="/home/akshatz/bond_order_free/pcba_scaffold/dataset/splits.json"
)

[14:42:50] Explicit valence for atom # 4 Al, 5, is greater than permitted
[14:43:22] Explicit valence for atom # 5 Al, 6, is greater than permitted


Filtered data saved to: /home/akshatz/bond_order_free/pcba_scaffold/dataset/pcba_scaffold_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/pcba_scaffold/dataset/pcba_scaffold_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/pcba_scaffold/dataset/splits_filtered.json


In [26]:
#pcqm4mv2
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/pcqm4mv2/dataset/pcqm4mv2_data.csv",
    splits_path="/home/akshatz/bond_order_free/pcqm4mv2/dataset/splits.json"
)

[14:44:08] Explicit valence for atom # 1 Si, 6, is greater than permitted
[14:44:13] Explicit valence for atom # 4 Si, 5, is greater than permitted
[14:44:15] Explicit valence for atom # 1 Si, 5, is greater than permitted
[14:44:25] Explicit valence for atom # 1 Si, 5, is greater than permitted
[14:44:26] Explicit valence for atom # 6 Si, 5, is greater than permitted
[14:44:29] Explicit valence for atom # 4 P, 6, is greater than permitted
[14:44:38] Explicit valence for atom # 2 Si, 5, is greater than permitted
[14:44:53] Explicit valence for atom # 1 Si, 5, is greater than permitted
[14:45:02] Conflicting single bond directions around double bond at index 13.
[14:45:02]   BondStereo set to STEREONONE and single bond directions set to NONE.
[14:45:10] Explicit valence for atom # 1 Si, 5, is greater than permitted
[14:45:22] Explicit valence for atom # 2 Si, 5, is greater than permitted
[14:45:23] Explicit valence for atom # 1 Si, 5, is greater than permitted
[14:45:39] Explicit valence

Filtered data saved to: /home/akshatz/bond_order_free/pcqm4mv2/dataset/pcqm4mv2_data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/pcqm4mv2/dataset/pcqm4mv2_data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/pcqm4mv2/dataset/splits_filtered.json


In [27]:
# sampl all_data
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/logp/dataset/logP/data.csv",
    splits_path="/home/akshatz/bond_order_free/logp/dataset/logP/splits.json"
)

Filtered data saved to: /home/akshatz/bond_order_free/logp/dataset/logP/data_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/logp/dataset/logP/data_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/logp/dataset/logP/splits_filtered.json


In [28]:
# sampl without_overlap
process_dataset_and_splits(
    data_path="/home/akshatz/bond_order_free/logp/dataset/logP/logP_without_overlap.csv",
    splits_path="/home/akshatz/bond_order_free/logp/dataset/logP/splits.json"
)

Filtered data saved to: /home/akshatz/bond_order_free/logp/dataset/logP/logP_without_overlap_filtered.csv
Invalid SMILES saved to: /home/akshatz/bond_order_free/logp/dataset/logP/logP_without_overlap_invalid.csv
Updated splits saved to: /home/akshatz/bond_order_free/logp/dataset/logP/splits_filtered.json
