In [91]:
import gzip
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [92]:
# Directory (relative to the notebook) and file name of the gzipped SDF input.
path_sdf = os.path.join(os.pardir, 'data', '01_raw', 'sdf_files')
filename_sdf = 'Compound_001000004_001000006.sdf.gz'

In [93]:
# Directory (relative to the notebook) and file name of the CSV that is written.
path_csv = os.path.join(os.pardir, 'data', '01_raw')
filename_csv = 'Compound_001000004_001000006.csv'

In [94]:
# Full path of the gzipped SDF input, built from the directory and file name above.
sdf_file = os.path.join(path_sdf, filename_sdf)

In [95]:
# Read every molecule from the gzipped SDF and build `df` with one row per
# readable molecule: its SMILES string plus a handful of descriptors.
try:
    with gzip.open(sdf_file, 'rb') as gz:
        # The supplier is iterated once, front to back, over the open gzip stream.
        supplier = Chem.ForwardSDMolSupplier(gz)
        data = []
        for i, mol in enumerate(supplier):
            # A None entry is treated as an unreadable record and skipped.
            if mol is None:
                print(f"Warning: Skipping invalid molecule at index {i}")
                continue
            try:
                data.append({
                    "SMILES": Chem.MolToSmiles(mol),
                    "Molecular Weight": Descriptors.MolWt(mol),
                    # Use the explicitly imported Descriptors module rather than
                    # Chem.Lipinski, which is never imported in this notebook.
                    "H-Bond Donors": Descriptors.NumHDonors(mol),
                    "H-Bond Acceptors": Descriptors.NumHAcceptors(mol),
                    "LogP": Descriptors.MolLogP(mol),
                })
            except Exception as e:
                # Report the same 0-based index as the warning above so the two
                # messages can be cross-referenced.
                print(f"Error processing molecule at index {i}: {e}")

        df = pd.DataFrame(data)
except Exception as e:
    print(f"Error processing file: {e}")
    # Re-raise: otherwise `df` is left undefined (or stale from an earlier run)
    # and the following cells would display or save the wrong data.
    raise

In [96]:
# Preview the first rows of the descriptor table.
df.head()

Unnamed: 0,SMILES,Molecular Weight,H-Bond Donors,H-Bond Acceptors,LogP
0,Cc1ccc(S(=O)(=O)N(CC(=O)Nc2ccc3c(c2)OCO3)c2ccc...,452.532,1,5,4.17456
1,COc1cc(C=NNC(=O)c2cccc(C)c2)ccc1OCC(=O)Nc1ccccc1,417.465,2,5,3.78502
2,COc1ccc(N(CC(=O)NCc2ccco2)S(=O)(=O)c2ccc(C)cc2...,448.928,1,5,3.76172


In [97]:
# Write the descriptor table to CSV, leaving out the DataFrame index column.
csv_file = os.path.join(path_csv, filename_csv)
df.to_csv(csv_file, index=False)