In [None]:
import pathlib

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import Draw
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from tqdm import tqdm

In [None]:
# Read the original dataset.
# The relative path uses forward slashes so it resolves on Linux/macOS as well
# as on Windows (a backslash-separated path only works on Windows).
# NOTE(review): the variable is named "49k" while the file is "FIA53k" - confirm
# which dataset version is intended.
data_fia49k = pd.read_csv("../data/FIA53k.csv", low_memory=False)

In [None]:
# Subsample the "la_smiles" column: keep entries 0, 4, 8, ... to save time.
data = data_fia49k["la_smiles"].values[slice(None, None, 4)]

In [None]:
# Generate RDKit mol objects and molecular formulas from the SMILES strings.
# `mols` and `formulars` stay index-aligned with `data`, which later cells rely on.
mols = []
formulars = []

for position, smiles in enumerate(tqdm(data)):
    # Chem.MolFromSmiles returns None for strings it cannot parse, and non-string
    # entries (e.g. NaN from the CSV) cannot be parsed at all. Fail right here
    # with the offending entry instead of crashing later inside CalcMolFormula.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(
            f"Could not parse SMILES at position {position}: {smiles!r}"
        )
    mols.append(mol)
    formulars.append(CalcMolFormula(mol))

In [None]:
# Render the first molecule in `mols` as an image.
Draw.MolToImage(mols[0])

In [None]:
# Compute the RDKit descriptors of every molecule in parallel
# (n_jobs=-1 uses all available cores).
descrs = Parallel(n_jobs=-1)(
    delayed(Descriptors.CalcMolDescriptors)(molecule) for molecule in tqdm(mols)
)

# Keep only the descriptors of interest.
interesting_descriptors = [
    "MolWt", "ExactMolWt", "NumValenceElectrons",
    "HeavyAtomCount", "NHOHCount", "NOCount",
]
df = pd.DataFrame(descrs)[interesting_descriptors]

# Put the identifying columns in front: "smiles" first, then "Formular".
df.insert(0, "Formular", formulars)
df.insert(0, "smiles", data)

In [None]:
# Data selection: pick molecules evenly spread over the range of one property.
# Choose which property to sort by and how many samples are wanted.
property_to_select = "MolWt"
n_samples = 15

# Sort by the chosen property and attach the multiplicity / charge columns.
df_sorted = df.sort_values(property_to_select)
df_sorted["multiplicity"] = 1
df_sorted["charge"] = 0

# Equally spaced positions in the sorted frame, from the first row (0) to the
# last row (len - 1), yielding exactly `n_samples` rows. The count is capped at
# the frame length so a small or empty frame never produces repeated or
# out-of-range positions.
n_picked = min(n_samples, len(df_sorted))
idx = np.round(np.linspace(0, len(df_sorted) - 1, n_picked)).astype(int)

sampled_values = df_sorted.iloc[idx]
print(len(sampled_values))

In [None]:
# Display the sampled rows.
sampled_values

In [None]:
# Draw all sampled molecules together in one grid image.
Chem.Draw.MolsToGridImage(
    list(map(Chem.MolFromSmiles, sampled_values["smiles"].values))
)

In [None]:
# Grid image of the sampled molecules.
# NOTE(review): this cell repeats the drawing done in the preceding cell.
Draw.MolsToGridImage(
    [Chem.MolFromSmiles(smi) for smi in sampled_values["smiles"].values]
)

In [None]:
# sampled_values.to_csv("example_molecules.csv")

In [None]:
# Histogram of the "MolWt" column over all molecules in `df`.
# Series.hist returns the matplotlib Axes, which is labelled here so the figure
# can be read on its own; the trailing semicolon suppresses the repr output.
df["MolWt"].hist().set(
    xlabel="MolWt",
    ylabel="Number of molecules",
    title="Distribution of MolWt",
);

In [None]:
# Display the sampled rows again, next to the histogram above.
sampled_values

In [None]:
# Parse the sampled SMILES strings into RDKit mol objects.
mol_list = list(map(Chem.MolFromSmiles, sampled_values["smiles"].values))

In [None]:
# mols_3d = [Chem.AddHs(Chem.MolFromSmiles(smiles)) for smiles in sampled_values["smiles"].values]

# for mol in mols_3d:
#     AllChem.EmbedMolecule(mol)

# import nglview as nv

# # Visualize the first molecule
# view = nv.show_rdkit(mols_3d[1])
# view

In [None]:
# Build a 3D geometry for every sampled molecule, write it to an XYZ file and
# collect the table (path, key, multiplicity, charge) that describes the files.

# Fixed seed for the conformer embedding so re-running the notebook produces
# the same geometries.
EMBED_SEED = 42

file_path = pathlib.Path("./benchmark/xyz_files")
file_path.mkdir(parents=True, exist_ok=True)
file_names = []
formulas = []
for i, mol in enumerate(mol_list):
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns -1 when no conformer could be generated. Without a
    # conformer there is nothing to optimize or write, so skip the molecule.
    if AllChem.EmbedMolecule(mol, randomSeed=EMBED_SEED) < 0:
        print(f"Skipping molecule {i}: 3D embedding failed")
        continue

    # MMFFOptimizeMolecule returns 0 on convergence, 1 if more iterations are
    # needed and -1 if the force field could not be set up. The embedded
    # geometry is still written in the non-zero cases, but the user is told.
    mmff_status = AllChem.MMFFOptimizeMolecule(mol)
    if mmff_status != 0:
        print(f"Molecule {i}: MMFF optimization returned status {mmff_status}")

    formula = CalcMolFormula(mol)
    # Isomers share one formula. Suffix repeated formulas with the molecule
    # index so that neither the XYZ file nor the key of an earlier molecule is
    # overwritten or duplicated.
    key = formula if formula not in formulas else f"{formula}_{i}"
    formulas.append(key)
    file_name = file_path / f"{key}.xyz"
    file_names.append(file_name.resolve())
    # Write to XYZ file
    with open(file_name, "w") as f:
        f.write(Chem.MolToXYZBlock(mol))


# NOTE(review): multiplicity and charge are fixed to 1 and 0 for every
# molecule - confirm that all sampled molecules are neutral closed-shell species.
benchmark_csv_table = pd.DataFrame({"path": file_names, "key": formulas})
benchmark_csv_table["multiplicity"] = 1
benchmark_csv_table["charge"] = 0

In [None]:
# Save the benchmark table (without the index column) next to the XYZ files.
benchmark_csv_table.to_csv(
    file_path.joinpath("benchmark_molecules.csv"),
    index=False,
)