In [6]:
from rdkit import Chem
import numpy as np

# Example molecules, written as SMILES strings, used as inputs by the cells below.
Celecoxib = "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1"
Troglitazone = "Cc1c(C)c2c(c(C)c1O)CCC(C)(COc1ccc(CC3SC(=O)NC3=O)cc1)O2"
# This one carries "/" bond-direction markers around its C=C double bond.
Thiothixene = "CN1CCN(CC/C=C2/c3ccccc3Sc3ccc(S(=O)(=O)N(C)C)cc32)CC1"

def randomize_smiles(smiles, rng=None):
    """Return a randomized (non-canonical) SMILES string for data augmentation.

    The molecule is parsed, its atoms are renumbered in a random order, and
    the result is written back out with ``canonical=False`` so the new atom
    order determines the output string.

    Parameters
    ----------
    smiles : str
        SMILES string to randomize.
    rng : optional
        Object exposing an in-place ``shuffle(list)`` method, e.g.
        ``np.random.default_rng(seed)`` or ``np.random.RandomState(seed)``.
        When omitted, the global ``np.random`` state is used (the original
        behaviour), so results are only reproducible if that state is seeded.

    Returns
    -------
    str
        A randomized SMILES string, or ``smiles`` unchanged when it cannot be
        parsed or the parsed molecule has no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Parsing failed: hand the input back untouched instead of raising.
        return smiles

    atom_order = list(range(mol.GetNumAtoms()))
    if not atom_order:
        # Nothing to permute for a molecule without atoms.
        return smiles

    shuffler = np.random if rng is None else rng
    shuffler.shuffle(atom_order)  # in-place permutation of the atom indices

    new_mol = Chem.RenumberAtoms(mol, atom_order)
    return Chem.MolToSmiles(new_mol, canonical=False)

In [4]:
# Print four randomized SMILES for Celecoxib. No random seed is set in the
# visible cells, so the printed strings are expected to differ between runs.
for i in range(4):
    print(randomize_smiles(Celecoxib))

c1(-c2ccc(C)cc2)n(-c2ccc(S(N)(=O)=O)cc2)nc(C(F)(F)F)c1
c1c(S(N)(=O)=O)ccc(-n2nc(C(F)(F)F)cc2-c2ccc(C)cc2)c1
c1(-n2nc(C(F)(F)F)cc2-c2ccc(C)cc2)ccc(S(=O)(=O)N)cc1
c1(C(F)(F)F)cc(-c2ccc(C)cc2)n(-c2ccc(S(=O)(N)=O)cc2)n1


In [7]:
# Print four randomized SMILES for Troglitazone. No random seed is set in the
# visible cells, so the printed strings are expected to differ between runs.
for i in range(4):
    print(randomize_smiles(Troglitazone))

CC1(COc2ccc(CC3C(=O)NC(=O)S3)cc2)Oc2c(C)c(C)c(O)c(C)c2CC1
c12c(c(C)c(O)c(C)c1C)CCC(C)(COc1ccc(CC3C(=O)NC(=O)S3)cc1)O2
C1(COc2ccc(CC3C(=O)NC(=O)S3)cc2)(C)CCc2c(C)c(O)c(C)c(C)c2O1
c1(C)c2c(c(C)c(C)c1O)OC(C)(COc1ccc(CC3SC(=O)NC3=O)cc1)CC2


In [8]:
# Print four randomized SMILES for Thiothixene. No random seed is set in the
# visible cells, so the printed strings are expected to differ between runs.
for i in range(4):
    print(randomize_smiles(Thiothixene))

c1cc2c(cc1)Sc1c(cc(S(=O)(=O)N(C)C)cc1)/C2=C\CCN1CCN(C)CC1
c1cc2c(cc1)/C(=C/CCN1CCN(C)CC1)c1cc(S(=O)(N(C)C)=O)ccc1S2
C(N1CCN(C)CC1)C/C=C1/c2ccccc2Sc2c1cc(S(N(C)C)(=O)=O)cc2
c1c(S(=O)(=O)N(C)C)ccc2c1/C(=C\CCN1CCN(C)CC1)c1ccccc1S2


In [10]:
def canonicalize_SMILES(smiles):
    """Convert an iterable of SMILES strings to RDKit's default SMILES output.

    Every input is parsed first; the parsed molecules are then written back
    out with ``isomericSmiles=False``. The result is a list with one output
    string per input, in input order.

    NOTE(review): no check is made on the parse result, so an unparsable
    input is passed straight to ``Chem.MolToSmiles`` — confirm whether
    callers rely on that raising.
    """
    parsed = list(map(Chem.MolFromSmiles, smiles))
    canonical = []
    for mol in parsed:
        canonical.append(Chem.MolToSmiles(mol, isomericSmiles=False))
    return canonical

# Two different randomized SMILES strings are passed through canonicalize_SMILES
# so their outputs can be compared.
# Raw strings are used because SMILES uses "\" as a bond marker; in a normal
# string literal "\C" is an invalid escape sequence that Python warns about.
canonicalize_SMILES([
    r"c1cc2c(cc1)Sc1c(cc(S(=O)(=O)N(C)C)cc1)/C2=C\CCN1CCN(C)CC1",
    r"c1c(S(=O)(=O)N(C)C)ccc2c1/C(=C\CCN1CCN(C)CC1)c1ccccc1S2",
])

['CN1CCN(CCC=C2c3ccccc3Sc3ccc(S(=O)(=O)N(C)C)cc32)CC1',
 'CN1CCN(CCC=C2c3ccccc3Sc3ccc(S(=O)(=O)N(C)C)cc32)CC1']