In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from group_selfies import(
    fragment_mols,
    Group,
    MolecularGraph,
    GroupGrammar,
    group_encoder
)
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger
from psmiles import PolymerSmiles as PS
import numpy as np
import pandas as pd
import selfies as sf
RDLogger.DisableLog('rdApp.*')
import re
from group_selfies import grammar_rules as gr
gr._PROCESS_ATOM_CACHE.clear()

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
df = pd.read_csv("simulation-trajectory-aggregate.csv")
df.head()

Unnamed: 0,Trajectory ID,SMILES,Molality,Monomer Molecular Weight,Degree of Polymerization,Density,CONDUCTIVITY,TFSI Diffusivity,Li Diffusivity,Poly Diffusivity,Transference Number
0,9425,COCC(CNCC(CF)OC(=O)*)O*,1.4,468.0,19.0,1.35,7.6e-05,3.23e-08,1.26e-08,1.53e-08,-0.163
1,9426,O=C(CCNC(=O)COC(=O)*)NCCN*,1.47,476.0,13.0,1.46,6.9e-05,1.32e-08,1.11e-08,8.64e-09,0.318
2,9427,NC(=O)C(COC(=O)*)NC(=O)CCO*,1.44,463.0,17.0,1.53,0.000104,1.4e-08,1.95e-08,9.18e-09,0.53
3,9428,CC(COC(=O)*)COC(=O)C(C)(C)CO*,1.43,477.0,16.0,1.28,2.7e-05,2.42e-08,1.19e-08,1.4e-08,-0.319
4,9429,COC(=O)CC(=O)NC(CO*)COC(=O)*,1.47,478.0,26.0,1.49,3.9e-05,1.35e-08,6.58e-09,8.67e-09,0.0806


In [5]:
smiles = df.iloc[:,1].values
ess = GroupGrammar.essential_set()
psmiles = [PS(s).canonicalize.psmiles for s in smiles]
gpselfies = [ess.full_encoder(Chem.MolFromSmiles(s)) for s in psmiles]

In [6]:
from rdkit import Chem
from group_selfies import group_grammar
from group_selfies.group_decoder import (
    _tokenize_selfies, Counter,
    selfies_to_graph_iterative, form_rings_bilocally_iterative
)

def decode_keep_star(grammar, selfies, sanitize=False, verbose=False):
    """
    Group SELFIES → RDKit Mol, but KEEP '*' dummy atoms (do not H-cap).
    """
    rings = []
    place_from_idx = {}
    inverse_place = []
    dummy_counter = Counter(1)
    group_atom = {}

    mol = selfies_to_graph_iterative(
        grammar=grammar,
        symbol_iter=_tokenize_selfies(selfies),
        selfies=selfies,
        rings=rings,
        dummy_counter=dummy_counter,
        place_from_idx=place_from_idx,
        inverse_place=inverse_place,
        verbose=verbose,
        group_atom=group_atom,
    )
    form_rings_bilocally_iterative(
        mol, rings, place_from_idx, inverse_place,
        dummy_counter, group_atom, verbose=verbose
    )

    res = mol.GetMol()  # convert RWMol→Mol
    if sanitize:
        # 기본 Sanitize는 '*'에도 대체로 안전하지만 필요시 제약 완화
        Chem.SanitizeMol(res, sanitizeOps=Chem.SanitizeFlags.SANITIZE_NONE)
    return res
psmiles_dec = [decode_keep_star(ess, s) for s in gpselfies]
print(psmiles_dec)

[<rdkit.Chem.rdchem.Mol object at 0x7fdb405f68f0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405f6d50>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405f5b60>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405bace0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405bb610>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405bab20>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405ba490>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405ba5e0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405b9310>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405bb680>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405ba6c0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405bb760>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405bb6f0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405ba8f0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb405b93f0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb40472260>, <rdkit.Chem.rdchem.Mol object at 0x7fdb40470dd0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb404722d0>, <rdkit.Chem.rdchem.Mol object at 0x7fdb40470f90>, <rdkit.Chem.rdchem.Mol object at 0x7fdb40471540>,

In [7]:
psmiless = [PS(Chem.MolToSmiles(s)).canonicalize.psmiles for s in psmiles_dec]
print(psmiless)

['[*]CNCC(CF)OC(=O)OC([*])COC', '[*]CCNC(=O)COC(=O)NCCNC([*])=O', '[*]COC(=O)OCCC(=O)NC([*])C(N)=O', '[*]COC(=O)OCC(C)(C)C(=O)OCC([*])C', '[*]COC(=O)OCC([*])NC(=O)CC(=O)OC', '[*]COC(=O)OC([*])CNC(=O)CC(=O)OC', '[*]OC(=O)OC(C)(C)C(C)NC(=O)C([*])C', '[*]CCN(C)C(=O)CCNC(=O)OC([*])C', '[*]CNC(COC(=O)OC([*])C)C(N)=O', '[*]COC(=O)OC([*])CNC(=O)NC(C)C', '[*]COC(=O)OCC([*])(C)NC(=O)CC#N', '[*]CNCC(C)OC(=O)OC([*])CSC', '[*]NC(=O)NCCNC(=O)C([*])CC#C', '[*]CC(C)OC(=O)OCC(C)N([*])CCC', '[*]COC(=O)OCCC(=O)NC([*])CCOC', '[*]CN(C)CC(C)(OC(=O)OC([*])C)C(F)F', '[*]CCCCOC(=O)NCCN([*])CC', '[*]COC(=O)NC([*])C(=O)NCCOCC', '[*]CCOC(=O)OC(C)(C)CCN([*])CCC', '[*]COC(=O)OC([*])CNC(CC)CCC', '[*]COC(=O)OCC([*])NC(=S)NCOC', '[*]C(=O)N(C)CCOC(=O)OC(C)C([*])CC', '[*]CCNC(=O)NCCN([*])C(CC)CCC', '[*]COC(=O)OCC([*])NC(=O)OC', '[*]COC(=O)OCCC(C)(C)NC(=O)C([*])=C', '[*]COC(=O)OCC([*])NC(=O)CCCCC', '[*]OC(=O)OCCN(C)C(=O)C([*])C(C)C', '[*]CCOC(=O)OCCC(=O)N([*])C', '[*]COC(=O)OC([*])CN(C)C(C)C(=O)OC', '[*]COC(=O)OC(C)C([*