In [1]:
import os
import random
import sys
import gzip
from itertools import islice
import numpy as np
import pandas as pd
import rdkit
from tqdm import tqdm

from pathlib import Path

from rdkit.Chem.Descriptors import NumRadicalElectrons

from sklearn.model_selection import train_test_split

# Ignore rdkit warnings: https://github.com/rdkit/rdkit/issues/2683
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [2]:
def iter_mols(sdf_path='20200415_radical_database.sdf.gz', total=289639):
    """Stream the records of a gzipped SDF file as ``pd.Series`` objects.

    Each yielded Series holds the molecule's SDF properties plus three extra
    entries added here:

    * ``'mol'``  -- the RDKit molecule object itself,
    * ``'type'`` -- ``'molecule'`` when ``NumRadicalElectrons(mol) == 0``,
      otherwise ``'fragment'``,
    * ``'Name'`` -- the value of the molecule's ``_Name`` property.

    Parameters
    ----------
    sdf_path : str or path-like, optional
        Location of the gzipped SDF file. Defaults to the database file the
        notebook was written against.
    total : int or None, optional
        Expected number of records; used only to size the tqdm progress bar.

    Yields
    ------
    pd.Series
        One Series per successfully parsed record.
    """
    # Properties that are evaluated back into Python objects below.
    literal_props = ('AtomCharges', 'AtomSpins', 'VibFreqs', 'IRIntensity', 'RotConstants')

    with gzip.open(sdf_path) as sdffile:
        # removeHs=False is passed through so hydrogens are not stripped on load.
        mol_suppl = rdkit.Chem.ForwardSDMolSupplier(sdffile, removeHs=False)
        for mol in tqdm(mol_suppl, total=total):
            # RDKit suppliers hand back None for a record they cannot parse;
            # skip it instead of failing on mol.GetPropsAsDict().
            if mol is None:
                continue

            props = mol.GetPropsAsDict()
            props['mol'] = mol
            for item in literal_props:
                raw_value = props.get(item)
                # Only strings can be evaluated; presumably GetPropsAsDict may
                # already have converted some values to numbers, and eval()
                # raises TypeError on those -- leave them untouched.
                if not isinstance(raw_value, str):
                    continue
                try:
                    # NOTE(security): eval() executes arbitrary expressions
                    # taken from the file. Only run this on a trusted SDF
                    # file; consider ast.literal_eval if the stored values
                    # are plain literals.
                    props[item] = eval(raw_value)
                except (NameError, SyntaxError):
                    # Best effort: a value that does not evaluate is kept as
                    # the raw string.
                    pass

            props['type'] = 'molecule' if NumRadicalElectrons(mol) == 0 else 'fragment'
            props['Name'] = mol.GetProp('_Name')
            yield pd.Series(props)

In [3]:
# 1. Load data
# Drain the iter_mols() generator into a single frame, one row per record.
df = pd.DataFrame(iter_mols())

# Derived column: natural log of the sign-flipped SCF energy.
df["log(-SCFEnergy)"] = np.log(
    -df["SCFEnergy"]
)

  2%|█▍                                                                         | 5492/289639 [00:03<03:19, 1421.68it/s]

KeyboardInterrupt



In [None]:
# 2. Sample data
# Both the subsample and the train/test split are seeded so that a fresh
# "Restart & Run All" reproduces the same files.
SEED = 42
N_SAMPLE = 1000

# Cap the sample size at the number of available rows so a smaller frame
# does not raise.
df_sample = df.sample(n=min(N_SAMPLE, len(df)), random_state=SEED)
df_train, df_test = train_test_split(df_sample, test_size=0.2, random_state=SEED)

# 3. Save training and test set
d_folder = Path('qc_data')
d_folder.mkdir(parents=True, exist_ok=True)

# Each split is written as header-less "<SMILES>,<log(-SCFEnergy)>" lines.
for split_name, split_df in (("train", df_train), ("test", df_test)):
    with open(d_folder / f"{split_name}.csv", "w") as f:
        for smi, y in zip(split_df["SMILES"], split_df["log(-SCFEnergy)"]):
            f.write(f"{smi},{y}\n")