In [5]:
from rdkit import Chem, DataStructs, rdBase
from rdkit.Chem import (Draw, PandasTools, AllChem, rdDepictor, rdMolDescriptors,
                        rdFMCS, rdmolops, Descriptors)
from rdkit.Chem.Draw import IPythonConsole, rdMolDraw2D
from molvs import standardize_smiles
import pandas as pd
from tqdm import tqdm
import io
from PIL import Image
import os
rdDepictor.SetPreferCoordGen(True)


In [6]:
sample_data = pd.read_csv('Raw.csv',encoding='utf_8_sig')
sample_data

Unnamed: 0,Original_ID,SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,Assay,Description,Source,Publish_Date,No_symbol_Value
0,Ib,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,11.13,10^-6 cm/s,Papp AB,-4.953504836,Log Papp AB,MDCK,Data of bidirectional permeability study on te...,EP4074715A1,2022.10.19,11.13
1,502,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,5,10^-6 cm/s,Papp AB,-5.301029996,Log Papp AB,Caco-2,Caco-2 Permeability Assays for Representative ...,EP2054429B1,2013.11.06,5
2,1503,OC(C)[C@H]1C(=O)N[C@@H](C(=O)N[C@H](C)[C@H](CC...,12,10^6 cm/s,Papp BA,-4.920818754,Log Papp BA,Caco-2,Papp BA Without P-gp inhibitor,WO2011053821A1,2011.05.05,12
3,801,O=C1[C@@H](C)N(C([C@H](C2CCCCC2)NC[C@H]2CCC3C(...,0.672,,ER,0.672,ER,Caco-2,B to A/A to B,WO2008130464A1,2008.10.30,0.672
4,CHEMBL506515,COc1ccc2c(O[C@@H]3C[C@H]4C(=O)N[C@]5(C(=O)NS(=...,10,10^-6 cm/s,Papp AB,-5,Log Papp AB,Caco-2,Apparent permeability across apical to basolat...,10.1016/j.bmcl.2008.10.004,2008.10.01,10
...,...,...,...,...,...,...,...,...,...,...,...,...
127,CHEMBL5027915,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H...,440,nm/s,Papp,-4.356547324,Log Papp,OTHER,Permeability of compound in human HeLa cells b...,10.1021/acsmedchemlett.1c00438,2021.10.27,440
128,CHEMBL217374,CC[C@H](NCC1Cc2cccc(c2)CCCCc2cc(cc(N(C)S(C)(=O...,19,10^-6 cm/s,Papp,-4.721246399,Log Papp,OTHER,Apparent permeability in porcine LLC-PK1 cells,10.1021/jm060884i,2006.09.01,19
129,CHEMBL394225,CC1CCc2cccc(c2)C[C@@](C)(N)C(=O)OCc2cc(cc(N(C)...,16,10^-6 cm/s,Papp,-4.795880017,Log Papp,OTHER,Apparent permeability in human KBV1 cells over...,10.1016/j.bmcl.2007.08.040,2007.08.17,16
130,CHEMBL4542646,CN1CC/C=C/[C@H](O)[C@@H]2CC[C@H]2CN2C[C@@]3(CC...,2.1,10^-6 cm/s,Papp,-5.677780705,Log Papp,OTHER,Apparent permeability in pig LLC-PK1 cells by ...,10.1021/acs.jmedchem.9b01310,2019.11.18,2.1


In [7]:
def standardise(smiles):
    std_smiles = standardize_smiles(smiles)
    return standardize_smiles (smiles)

sample_data['Standardise_SMILES'] = sample_data['SMILES'].apply(standardise)
idx = sample_data.columns.get_loc('SMILES') + 1
# Remove the column and reinsert it to the desired location
standard_smiles_col = sample_data.pop('Standardise_SMILES')
sample_data.insert(idx, 'Standardise_SMILES', standard_smiles_col)
standard_smiles_to_id = {}

def assign_id(standard_smiles):
    if standard_smiles not in standard_smiles_to_id:
        standard_smiles_to_id[standard_smiles] = 'MC-' + str(len(standard_smiles_to_id) +1).zfill(4)
    return standard_smiles_to_id[standard_smiles]

# Assuming sample_data is your DataFrame and standard_smiles_to_id is your dictionary
sample_data['ID'] = sample_data['Standardise_SMILES'].apply(assign_id)


# Move the ID col to the first row
cols = sample_data.columns.tolist()
cols = [cols[-1]] + cols[:-1]  ## WARNING: DO NOT RUN THIS PART OF CODE MORE THAN ONCE!!
sample_data = sample_data[cols]
sample_data = sample_data.sort_values(by='ID')



In [8]:
smiles_list = sample_data['Standardise_SMILES']
ID = sample_data['ID']
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

In [9]:
def _get_macrocycle_ring_mol(mol, strip=False):
    Chem.RemoveStereochemistry(mol)
    Chem.Kekulize(mol, clearAromaticFlags=True)
    ri = mol.GetRingInfo()
    atoms_ring = max(ri.AtomRings(), key=len)
    if strip:
        macrocycle_smiles = AllChem.MolFragmentToSmiles(mol, atomsToUse=list(atoms_ring))
    else:
        query = Chem.MolFromSmarts('[!#1;!#7]~[R]*')
        matches = mol.GetSubstructMatches(query)
        atoms_to_use = list(atoms_ring) + [m[0] for m in matches]
        macrocycle_smiles = AllChem.MolFragmentToSmiles(mol, atomsToUse=atoms_to_use)
    macrocycle_mol = Chem.MolFromSmiles(macrocycle_smiles)
    mol_frags = rdmolops.GetMolFrags(macrocycle_mol, asMols=True)
    largest_mol = max(mol_frags, default=macrocycle_mol, key=lambda m: m.GetNumAtoms())
    return largest_mol


In [27]:
class process:
    def __init__(self, list_of_smiles):
        self.smiles = list_of_smiles
        self.standardized_smiles = [standardize_smiles(smi) for smi in list_of_smiles]
        self.mols = [Chem.MolFromSmiles(smi) for smi in self.standardized_smiles]

    def get_macrocycle_ring_size(self):
        ring_sizes = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=True)
            ring_size = macrocycle_mol.GetNumBonds()
            ring_sizes.append(ring_size)
        return ring_sizes

    def get_macrocycle_ring_size_for_mol (self,mol):
        macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=True)
        ring_size = macrocycle_mol.GetNumBonds()
        return ring_size


    def get_free_amide_count(self):
        free_amide_counts = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
            free_amide_smart = Chem.MolFromSmarts('[Nh][CX3]=[O]')
            free_amide_count = len(macrocycle_mol.GetSubstructMatches(free_amide_smart))
            free_amide_counts.append(free_amide_count)
        return free_amide_counts

    def get_free_amide_count_for_mol(self, mol):
        macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
        free_amide_smart = Chem.MolFromSmarts('[Nh][CX3]=[O]')
        return len(macrocycle_mol.GetSubstructMatches(free_amide_smart))

    def get_sub_amide_count(self):
        sub_amide_counts = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
            free_amide_count = self.get_free_amide_count_for_mol(mol)
            all_amide_smart = Chem.MolFromSmarts('[#6](=[#8])-[#7]-[#6]')
            all_amide_count = len(macrocycle_mol.GetSubstructMatches(all_amide_smart))
            sub_amide_count = (all_amide_count - free_amide_count) / 2
            sub_amide_counts.append(sub_amide_count)
        return sub_amide_counts

    def get_sub_amide_count_for_mol(self, mol):
        macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
        free_amide_count = self.get_free_amide_count_for_mol(mol)
        all_amide_smart = Chem.MolFromSmarts('[#6](=[#8])-[#7]-[#6]')
        all_amide_count = len(macrocycle_mol.GetSubstructMatches(all_amide_smart))
        sub_amide_count = (all_amide_count - free_amide_count) / 2
        return sub_amide_count

    def get_overall_amide_count(self):
        overall_amide_counts = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
            free_amide_count = self.get_free_amide_count_for_mol(mol)
            sub_amide_count = self.get_sub_amide_count_for_mol(mol)
            overall_amide_count = free_amide_count + sub_amide_count
            overall_amide_counts.append(overall_amide_count)
        return overall_amide_counts

    def get_macrocycle_core_smiles(self):
        core_smiles = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=True)
            macrocycle_core_smiles = Chem.MolToSmiles(macrocycle_mol)
            core_smiles.append(macrocycle_core_smiles)
        return core_smiles

    def get_macrocycle_peripheral_smiles(self):
        peripheral_smiles = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
            macrocycle_peripheral_smiles = Chem.MolToSmiles(macrocycle_mol)
            peripheral_smiles.append(macrocycle_peripheral_smiles)
        return peripheral_smiles

    def get_macrocycle_free_amide_ratio (self):
        macrocycle_free_amide_ratios = []
        for mol in tqdm(self.mols):
            free_amide_count = self.get_free_amide_count_for_mol(mol)
            ring_size = self.get_macrocycle_ring_size_for_mol(mol)
            free_amide_ratio = free_amide_count*3/ring_size
            macrocycle_free_amide_ratios.append(free_amide_ratio)
        return macrocycle_free_amide_ratios

    def get_macrocycle_amide_ratio (self):
        macrocycle_amide_ratios = []
        for mol in tqdm(self.mols):
            macrocycle_mol = _get_macrocycle_ring_mol(mol, strip=False)
            free_amide_count = self.get_free_amide_count_for_mol(mol)
            sub_amide_count = self.get_sub_amide_count_for_mol(mol)
            overall_amide_count = free_amide_count + sub_amide_count
            ring_size = self.get_macrocycle_ring_size_for_mol(mol)
            amide_ratio = overall_amide_count*3/ring_size
            macrocycle_amide_ratios.append(amide_ratio)
        return macrocycle_amide_ratios

    def get_num_of_rings (self):
        num_rings = []
        for mol in tqdm(self.mols):
            num_ring = mol.GetRingInfo().NumRings()
            num_rings.append(num_ring)
        return num_rings

    def get_num_of_aromatic_rings(self):
        num_aromatic_rings = []
        for mol in tqdm(self.mols):
            num_aromatic_ring = len([ring for ring in mol.GetRingInfo().AtomRings()
                                if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring)])
            num_aromatic_rings.append(num_aromatic_ring)
        return num_aromatic_rings

    def get_cLogP(self):
        cLogP = []
        for mol in tqdm(self.mols):
            cLogP.append(Descriptors.MolLogP(mol))
        return cLogP

    def get_molecular_weight(self):
        molecular_weights = []
        for mol in tqdm(self.mols):
            molecular_weight = Descriptors.MolWt(mol)
            molecular_weights.append(molecular_weight)
        return molecular_weights

    def get_Num_H_Acceptors(self):
        Num_H_Acceptors = []
        for mol in tqdm(self.mols):
            Num_H_Acceptor = Descriptors.NumHAcceptors(mol)
            Num_H_Acceptors.append(Num_H_Acceptor)
        return Num_H_Acceptors

    def get_Num_H_donors(self):
        Num_H_donors = []
        for mol in tqdm(self.mols):
            Num_H_donor = Descriptors.NumHDonors(mol)
            Num_H_donors.append(Num_H_donor)
        return Num_H_donors

    def get_Num_Heavy_Atoms(self):
        Num_Heavy_Atoms = []
        for mol in tqdm(self.mols):
            Num_Heavy_Atom = Descriptors.HeavyAtomCount(mol)
            Num_Heavy_Atoms.append(Num_Heavy_Atom)
        return Num_Heavy_Atoms

    def get_Num_Carbon_Atoms(self):
        Num_Carbon_Atoms = []
        for mol in tqdm(self.mols):
            Num_Carbon_Atom = sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == 'C')
            Num_Carbon_Atoms.append(Num_Carbon_Atom)
        return Num_Carbon_Atoms

    def get_fraction_sp3_carbons(self):
        fraction_sp3_carbons = []
        sp3_carbons = 0
        total_carbons = 0
        for mol in tqdm(self.mols):
            for atom in mol.GetAtoms():
              if atom.GetSymbol() == 'C':  # Check if the atom is a carbon atom
                total_carbons += 1
              if atom.GetHybridization() == Chem.HybridizationType.SP3:
                sp3_carbons += 1
              fraction_sp3_carbon = sp3_carbons / total_carbons
            fraction_sp3_carbons.append(fraction_sp3_carbon)
        return fraction_sp3_carbons

    def get_TPSA(self):
        TPSA = []
        for mol in tqdm(self.mols):
            tpsa = Descriptors.TPSA(mol)
            TPSA.append(tpsa)
        return TPSA

    def get_Num_Rotatable_Bonds(self):
        Num_Rotatable_Bonds = []
        for mol in tqdm(self.mols):
            Num_Rotatable_Bond = Descriptors.NumRotatableBonds(mol)
            Num_Rotatable_Bonds.append(Num_Rotatable_Bond)

        return Num_Rotatable_Bonds

    def get_Num_Charged_Atoms(self):
        Num_Charged_Atoms = []
        for mol in tqdm(self.mols):
            Num_Charged_Atom = Descriptors.NumValenceElectrons(mol)
            Num_Charged_Atoms.append(Num_Charged_Atom)

        return Num_Charged_Atoms
    def get_Net_Charge(self):
        Net_Charge = []
        for mol in tqdm(self.mols):
            net_charge = rdmolops.GetFormalCharge(mol)
            Net_Charge.append(net_charge)
        return Net_Charge

    def get_Kier_index(self):
        Kier_index = []
        for mol in tqdm(self.mols):
            kier_index = Descriptors.Kappa1(mol)*Descriptors.Kappa2(mol)/Descriptors.HeavyAtomCount(mol)
            Kier_index.append(kier_index)

        return Kier_index

    def get_InchiKey (self):
        InchiKey = []
        for mol in tqdm(self.mols):
            inchi_key = Chem.InchiToInchiKey(Chem.MolToInchi(mol))
            InchiKey.append(inchi_key)
        return InchiKey


    def result(self):
        Macrocycle_Ring_Sizes = self.get_macrocycle_ring_size()
        Macrocycle_Free_Amide_Counts = self.get_free_amide_count()
        Macrocycle_Substituted_Amide_Count = self.get_sub_amide_count()
        Macrocycle_Overall_Amide_Count = self.get_overall_amide_count()
        Macrocycle_Core_Smiles = self.get_macrocycle_core_smiles()
        Macrocycle_Peripheral_Smiles = self.get_macrocycle_peripheral_smiles()
        Macrocycle_free_amide_ratios = self.get_macrocycle_free_amide_ratio()
        Macrocycle_amide_ratios = self.get_macrocycle_amide_ratio()
        Num_Rings = self.get_num_of_rings()
        Num_Aromatic_Rings = self.get_num_of_aromatic_rings()
        cLogP = self.get_cLogP()
        Molecular_Weight = self.get_molecular_weight()
        Num_H_Acceptors = self.get_Num_H_Acceptors()
        Num_H_donors = self.get_Num_H_donors()
        Num_Heavy_Atoms = self.get_Num_Heavy_Atoms()
        Num_Carbon_Atoms = self.get_Num_Carbon_Atoms()
        fraction_sp3_carbons = self.get_fraction_sp3_carbons()
        TPSA = self.get_TPSA()
        Num_Rotatable_Bonds = self.get_Num_Rotatable_Bonds()
        Num_Charged_Atoms = self.get_Num_Charged_Atoms()
        Net_Charge = self.get_Net_Charge()
        Kier_index = self.get_Kier_index()
        InchiKey = self.get_InchiKey()



        result_df = pd.DataFrame({
            "Macrocycle_Ring_Size": Macrocycle_Ring_Sizes,
            "Macrocycle_Free_Amide_Count": Macrocycle_Free_Amide_Counts,
            "Macrocycle_Substituted_Amide_Count": Macrocycle_Substituted_Amide_Count,
            "Macrocycle_Overall_Amide_Count": Macrocycle_Overall_Amide_Count,
            "Macrocycle_Core_Smiles": Macrocycle_Core_Smiles,
            "Macrocycle_Peripheral_Smiles": Macrocycle_Peripheral_Smiles,
            "Macrocycle_free_amide_ratios":Macrocycle_free_amide_ratios,
            "Macrocycle_amide_ratios":Macrocycle_amide_ratios,
            "Num_Rings":Num_Rings,
            "Num_Aromatic_Rings":Num_Aromatic_Rings,
            "cLogP":cLogP,
            "Molecular_Weight":Molecular_Weight,
            "Num_H_Acceptors":Num_H_Acceptors,
            "Num_H_donors":Num_H_donors,
            "Num_Heavy_Atoms":Num_Heavy_Atoms,
            "Num_Carbon_Atoms":Num_Carbon_Atoms,
            "Fraction_SP3_Carbons":fraction_sp3_carbons,
            "TPSA":TPSA,
            "Num_Rotatable_Bonds":Num_Rotatable_Bonds,
            "Num_Charged_Atoms":Num_Charged_Atoms,
            "Net_Charge":Net_Charge,
            "Kier_index":Kier_index,
            "InchiKey":InchiKey})
        return result_df

In [28]:
rdkit_featurizer = process(smiles_list)
rdkit_features = rdkit_featurizer.result()
rdkit_features

100%|██████████| 132/132 [00:00<00:00, 1665.64it/s]
100%|██████████| 132/132 [00:00<00:00, 748.26it/s]
100%|██████████| 132/132 [00:00<00:00, 360.62it/s]
100%|██████████| 132/132 [00:00<00:00, 179.69it/s]
100%|██████████| 132/132 [00:00<00:00, 1420.10it/s]
100%|██████████| 132/132 [00:00<00:00, 651.72it/s]
100%|██████████| 132/132 [00:00<00:00, 444.95it/s]
100%|██████████| 132/132 [00:00<00:00, 163.51it/s]
100%|██████████| 132/132 [00:00<00:00, 274899.77it/s]
100%|██████████| 132/132 [00:00<00:00, 7375.48it/s]
100%|██████████| 132/132 [00:00<00:00, 869.70it/s]
100%|██████████| 132/132 [00:00<00:00, 60514.61it/s]
100%|██████████| 132/132 [00:00<00:00, 8636.45it/s]
100%|██████████| 132/132 [00:00<00:00, 34450.14it/s]
100%|██████████| 132/132 [00:00<00:00, 96320.13it/s]
100%|██████████| 132/132 [00:00<00:00, 8550.29it/s]
100%|██████████| 132/132 [00:00<00:00, 6314.20it/s]
100%|██████████| 132/132 [00:00<00:00, 35329.47it/s]
100%|██████████| 132/132 [00:00<00:00, 5140.60it/s]
100%|████████

Unnamed: 0,Macrocycle_Ring_Size,Macrocycle_Free_Amide_Count,Macrocycle_Substituted_Amide_Count,Macrocycle_Overall_Amide_Count,Macrocycle_Core_Smiles,Macrocycle_Peripheral_Smiles,Macrocycle_free_amide_ratios,Macrocycle_amide_ratios,Num_Rings,Num_Aromatic_Rings,...,Num_H_donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,14,1,0.0,1.0,C1=CN=CNCCCOCCCNC1,C=CC1=NC=C(C)C(=O)NC2CC(C2)OCC(=CC(=C)F)C(C)N1,0.214286,0.214286,6,0,...,2,28,18,0.444444,90.68,0,144,0,3.676688,JXLZJPYLDDSEQF-UHFFFAOYSA-N
1,18,3,0.0,3.0,C1CCNCCNCCNCCNCCOCC1,CC1NC(=O)C(C)NC(=O)C(Cc2ccccc2)NCCOc2ccccc2CCC...,0.500000,0.500000,3,0,...,4,43,31,0.591837,111.80,5,236,0,12.377831,XLGYUJSXIXSLHY-UHFFFAOYSA-N
2,18,2,1.0,3.0,C1CCNCCNCCNCCNCCOCC1,CC1NCCOc2ccccc2CC(O)C(C)NC(=O)C(Cc2ccccc2)NC(=...,0.333333,0.500000,3,0,...,5,42,31,0.612500,140.23,4,230,0,11.838962,IOEZDPQDMJIGAP-UHFFFAOYSA-N
3,18,2,1.0,3.0,C1CCNCCNCCNCCNCCOCC1,CCCCC1NCC2CCc3cccc(c3O2)CCCNC(=O)C(C)NC(=O)C(C...,0.333333,0.500000,4,0,...,3,39,31,0.648649,99.77,3,216,0,10.275165,VZFTWTOTRKHOGV-UHFFFAOYSA-N
4,15,3,0.0,3.0,C1=CCCNCCNCNCCCCC1,CCC=C(OCCC1NC(=O)NCCCCCC=CC2CC2(C)NC1=O)c1ccc(...,0.600000,0.600000,7,0,...,3,51,36,0.625850,167.86,8,272,0,9.694833,FMBMILUSWDVMGF-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,36,10,1.0,11.0,C1CNCCNCCNCCNCCNCCSCCNCCNCCNCCNCCNCCN1,C=C(CC1NC(=O)C(C)NC(=O)C(C)NC(=O)C(C)NC(=O)C(C...,0.833333,0.916667,8,0,...,17,107,74,0.610443,505.10,18,572,0,28.798764,ZRBQWGXWYGDKMX-UHFFFAOYSA-N
128,14,1,0.0,1.0,C1=CCCNCC=CCCCCCC1,CC1Cc2cccc(c2)CCCCc2cccc(c2)C(=O)N1,0.214286,0.214286,3,0,...,3,39,30,0.610376,107.61,9,214,0,10.946379,ABGNOCQONIAVFH-UHFFFAOYSA-N
129,16,0,0.0,0.0,C1=CCCCCOCCC=CCCCCC1,CC1CCc2cccc(c2)COC(=O)C(C)Cc2cccc(c2)CC1,0.000000,0.000000,3,0,...,1,32,25,0.610108,89.70,2,174,0,7.301108,LLKXCBKWDZOBGU-UHFFFAOYSA-N
130,16,0,1.0,1.0,C1=CCCNCCCCC=CNCCCC1,CN1CCC=CC(O)C2CCC2CN2CC3(CCCc4ccccc43)COc3ccc(...,0.000000,0.187500,6,0,...,3,42,33,0.610080,110.54,1,224,0,7.746341,LHIKAJSPOXPDMX-UHFFFAOYSA-N


In [29]:
result_df = pd.concat([sample_data, rdkit_features], axis =1)
result_df

Unnamed: 0,ID,Original_ID,SMILES,Standardise_SMILES,Value,Unit,Endpoint,Standardized_Value,Standardized_Endpoint,Assay,...,Num_H_donors,Num_Heavy_Atoms,Num_Carbon_Atoms,Fraction_SP3_Carbons,TPSA,Num_Rotatable_Bonds,Num_Charged_Atoms,Net_Charge,Kier_index,InchiKey
0,MC-0001,Ib,N1=CC2C(N[C@H]3C[C@H](C3)OC3C(=CC(=CN=3)F)[C@@...,C[C@H]1Nc2nc3c(cnn3cc2F)C(=O)N[C@H]2C[C@H](C2)...,11.13,10^-6 cm/s,Papp AB,-4.953504836,Log Papp AB,MDCK,...,2,28,18,0.444444,90.68,0,144,0,3.676688,JXLZJPYLDDSEQF-UHFFFAOYSA-N
1,MC-0002,502,O=C1[C@@H](C(C)C)NC([C@@H](CC2=CC(=CC=C2)C(F)(...,CC(C)[C@H]1NC(=O)[C@@H](Cc2cccc(C(F)(F)F)c2)NC...,5,10^-6 cm/s,Papp AB,-5.301029996,Log Papp AB,Caco-2,...,4,43,31,0.591837,111.80,5,236,0,12.377831,XLGYUJSXIXSLHY-UHFFFAOYSA-N
2,MC-0003,1503,OC(C)[C@H]1C(=O)N[C@@H](C(=O)N[C@H](C)[C@H](CC...,CC(C)[C@@H]1NCCOc2ccccc2C[C@H](O)[C@@H](C)NC(=...,12,10^6 cm/s,Papp BA,-4.920818754,Log Papp BA,Caco-2,...,5,42,31,0.612500,140.23,4,230,0,11.838962,IOEZDPQDMJIGAP-UHFFFAOYSA-N
3,MC-0004,801,O=C1[C@@H](C)N(C([C@H](C2CCCCC2)NC[C@H]2CCC3C(...,CC(C)C[C@H]1NC(=O)[C@@H](C)N(C)C(=O)[C@H](C2CC...,0.672,,ER,0.672,ER,Caco-2,...,3,39,31,0.648649,99.77,3,216,0,10.275165,VZFTWTOTRKHOGV-UHFFFAOYSA-N
4,MC-0005,CHEMBL506515,COc1ccc2c(O[C@@H]3C[C@H]4C(=O)N[C@]5(C(=O)NS(=...,COc1ccc2c(O[C@@H]3C[C@H]4C(=O)N[C@]5(C(=O)NS(=...,10,10^-6 cm/s,Papp AB,-5,Log Papp AB,Caco-2,...,3,51,36,0.625850,167.86,8,272,0,9.694833,FMBMILUSWDVMGF-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,MC-0124,CHEMBL5027915,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H...,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H...,440,nm/s,Papp,-4.356547324,Log Papp,OTHER,...,17,107,74,0.610443,505.10,18,572,0,28.798764,ZRBQWGXWYGDKMX-UHFFFAOYSA-N
128,MC-0125,CHEMBL217374,CC[C@H](NCC1Cc2cccc(c2)CCCCc2cc(cc(N(C)S(C)(=O...,CC[C@H](NCC1Cc2cccc(c2)CCCCc2cc(cc(N(C)S(C)(=O...,19,10^-6 cm/s,Papp,-4.721246399,Log Papp,OTHER,...,3,39,30,0.610376,107.61,9,214,0,10.946379,ABGNOCQONIAVFH-UHFFFAOYSA-N
129,MC-0126,CHEMBL394225,CC1CCc2cccc(c2)C[C@@](C)(N)C(=O)OCc2cc(cc(N(C)...,CC1CCc2cccc(c2)C[C@@](C)(N)C(=O)OCc2cc(cc(N(C)...,16,10^-6 cm/s,Papp,-4.795880017,Log Papp,OTHER,...,1,32,25,0.610108,89.70,2,174,0,7.301108,LLKXCBKWDZOBGU-UHFFFAOYSA-N
130,MC-0127,CHEMBL4542646,CN1CC/C=C/[C@H](O)[C@@H]2CC[C@H]2CN2C[C@@]3(CC...,CN1CC/C=C/[C@H](O)[C@@H]2CC[C@H]2CN2C[C@@]3(CC...,2.1,10^-6 cm/s,Papp,-5.677780705,Log Papp,OTHER,...,3,42,33,0.610080,110.54,1,224,0,7.746341,LHIKAJSPOXPDMX-UHFFFAOYSA-N


In [None]:

# Define directory name
dir_name = '/Desktop/Overall_images'

# Check if the directory exists, if not create it
if not os.path.exists(dir_name):
    os.makedirs(dir_name)

# Assuming 'sample_data' is defined
# Drop duplicates
sample_data.drop_duplicates(subset='Standardise_SMILES', keep="first", inplace=True)

# Create Images and save in target folder
for index, row in sample_data.iterrows():
    molecule = row['Standardise_SMILES']
    if molecule is not None:
        drawer = rdMolDraw2D.MolDraw2DCairo(600, 600)
        drawer.drawOptions().clearBackground = False
        drawer.drawOptions().addStereoAnnotation = False
        drawer.DrawMolecule(Chem.MolFromSmiles(molecule))
        drawer.FinishDrawing()
        img_data = drawer.GetDrawingText()  # Get image data as bytes
        file_path = os.path.join(dir_name, f"{row['ID']}.png")
        with open(file_path, 'wb') as f:
            f.write(img_data)  # Write the bytes to a file
    else:
        print(f"SMILES string at index {index} could not be converted into a molecule.")


In [None]:

dir_name_2 = '/Desktop/Overall_sdf'

# Check if the directory exists, if not create it
if not os.path.exists(dir_name_2):
    os.makedirs(dir_name_2)

# Create Images and save in target folder
for index, row in sample_data.iterrows():
    molecule = row['Standardise_SMILES']
    if molecule:
        mol = Chem.MolFromSmiles(molecule)  # Assuming you need to convert SMILES to a molecule
        if mol is not None:
            file_path_2 = os.path.join(dir_name_2, f"{row['ID']}.sdf")
            Chem.MolToMolFile(mol, file_path_2)  # Save directly to file
