In [1]:
import os

import h5py
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm

from utils.grid2 import BuildGridCenters
from utils.xtb_density import interplot_ecloud

2024-04-07 10:47:32,758 - rdkit - INFO - Enabling RDKit 2022.09.4 jupyter extensions


In [2]:
def get_geom(mol, mmff=False):
    """Embed one 3D conformer for ``mol`` and return it without explicit hydrogens.

    Hydrogens are added before embedding and removed again from the returned
    molecule.

    Args:
        mol: RDKit molecule, e.g. the result of ``Chem.MolFromSmiles``.
        mmff: When true, optimise the embedded conformer with
            ``AllChem.MMFFOptimizeMolecule``.

    Returns:
        The hydrogen-stripped molecule carrying the embedded conformer.

    Raises:
        ValueError: If ``mol`` is ``None`` or no conformer could be embedded.
    """
    if mol is None:
        # Chem.MolFromSmiles yields None for SMILES it cannot parse.
        raise ValueError("get_geom received None instead of a molecule")
    mol_h = Chem.AddHs(mol)
    # EmbedMolecule returns the new conformer id, or -1 when embedding fails;
    # failing here gives a clearer error than a later GetConformer() call.
    if AllChem.EmbedMolecule(mol_h) < 0:
        raise ValueError("3D embedding failed: no conformer was generated")
    if mmff:
        AllChem.MMFFOptimizeMolecule(mol_h)
    return Chem.RemoveHs(mol_h)

def get_center(mol):
    """Return the per-axis mean of the atom positions of ``mol.GetConformer()``."""
    conformer = mol.GetConformer()
    coords = conformer.GetPositions()
    return coords.mean(axis=0)

def protocol(mode=32):
    """Build the origin-centred template grid for a supported grid size.

    Args:
        mode: Number of grid points per axis. Supported values are 32
            (spacing 0.5) and 64 (spacing 0.2).

    Returns:
        dict with two entries: ``'expanded_pcenters'`` (the result of
        ``BuildGridCenters``) and ``'N'`` (``[mode, mode, mode]``).

    Raises:
        ValueError: If ``mode`` is not one of the supported sizes.
    """
    resolutions = {32: 0.5, 64: 0.2}
    if mode not in resolutions:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of {sorted(resolutions)}"
        )
    size = mode
    resolution = resolutions[mode]
    N = [size, size, size]
    # First grid coordinate: half the box extent below the origin, moved
    # inwards by half a cell.
    llc = (np.zeros(3) - float(size * resolution / 2)) + resolution / 2
    expanded_pcenters = BuildGridCenters(llc, N, resolution)
    return {'expanded_pcenters': expanded_pcenters, 'N': N}

def get_ligecloud(mol,calculater, protocol, add_noise=True, rng=None):
    """Sample the electron cloud of ``mol`` on a grid centred on the molecule.

    Args:
        mol: Molecule with a conformer; passed to ``get_center`` and to
            ``calculater.calculate``.
        calculater: Object exposing ``calculate(mol)``; its result is handed to
            ``interplot_ecloud``.
        protocol: Mapping with the keys ``'expanded_pcenters'`` and ``'N'``,
            as returned by the module-level ``protocol()`` function (which this
            parameter name shadows inside the body).
        add_noise: When true, shift the whole grid by one random 3-vector
            drawn from a standard normal distribution.
        rng: Optional ``numpy.random.Generator`` used to draw the shift so the
            result can be reproduced. When ``None`` the global NumPy random
            state is used, as before.

    Returns:
        The interpolated values reshaped to ``protocol['N']``.
    """
    stand_grid = protocol['expanded_pcenters']
    N = protocol['N']
    # Assumes stand_grid is an ndarray, so the sum is a new array and the
    # in-place shift below leaves the shared template grid untouched.
    lig_grids = stand_grid + get_center(mol)
    if add_noise:
        if rng is None:
            offset = np.random.randn(3)
        else:
            offset = rng.standard_normal(3)
        lig_grids += offset.astype(np.float32)
    lig_ecloud = calculater.calculate(mol)
    # Move the last (coordinate) axis to the front before interpolating.
    lig_density = interplot_ecloud(lig_ecloud, lig_grids.transpose(3, 0, 1, 2)).reshape(N)
    return lig_density

In [4]:
# Molecule table; later cells read its SMILES, SPLIT, qed, logp and TPSA columns.
mols_data = pd.read_csv('./data/moses2.csv')

In [5]:
# Build an MMFF-optimised 3D geometry for the first molecule in the table,
# reusing get_geom instead of repeating its embed/optimise steps inline.
smiles = mols_data.iloc[0].SMILES
mol = get_geom(Chem.MolFromSmiles(smiles), mmff=True)

In [7]:
from utils.xtb_density import CDCalculator, interplot_ecloud
# Path to the xtb executable. Set the XTB_COMMAND environment variable to
# override the machine-specific default below.
xtb_command = os.environ.get(
    'XTB_COMMAND',
    '/home/haotian/Molecule_Generation/MG/ECloudGen_old/xtb-bleed/bin/xtb',
)
calculater = CDCalculator(xtb_command=xtb_command)

In [8]:
# Density grid for the single test molecule on the 32-point template grid.
grid_protocol = protocol(32)
lig_density = get_ligecloud(mol, calculater, grid_protocol)

In [9]:
# Sanity check: the grid should have the shape N = [32, 32, 32] set by protocol(32).
lig_density.shape

(32, 32, 32)

In [14]:
# Compute and save density grids for the training molecules among the first
# rows of the table, collecting the grids, conditions and SMILES in lists.
condition_list = []
ecloud_list = []
smiles_list = []

ecloud_dir = './data/ecloud'
# np.save does not create missing parent directories.
os.makedirs(ecloud_dir, exist_ok=True)

prot = protocol(32)
# Never index past the end of a short table.
for i in tqdm(range(min(10, len(mols_data)))):
    data = mols_data.iloc[i]
    if data.SPLIT != 'train':
        continue
    smiles = data.SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable SMILES: report and skip instead of aborting the batch.
        tqdm.write(f'Skipping row {i}: could not parse SMILES {smiles!r}')
        continue
    condition = [data.qed, data.logp, data.TPSA]
    mol = get_geom(mol,mmff=False)
    lig_density = get_ligecloud(mol,calculater, prot).astype(np.float32)
    # The file is named after the row position in mols_data.
    np.save(os.path.join(ecloud_dir, f'{i}.npy'),lig_density)
    ecloud_list.append(lig_density)
    condition_list.append(condition)
    smiles_list.append(smiles)

100%|██████████| 10/10 [00:01<00:00,  9.35it/s]


In [25]:
from data.dataset import create_h5

In [26]:
# NOTE(review): presumably bundles the grids under ./mol_data/ecloud together
# with metadata from the CSV into ecloud.h5 — confirm against data.dataset.create_h5.
# NOTE(review): earlier cells read and write under ./data/, while this call
# uses ./mol_data/ — confirm the paths are intentional.
create_h5('./mol_data/ecloud.h5', './mol_data/ecloud', './mol_data/moses2.csv')

100%|██████████| 854/854 [00:01<00:00, 643.33it/s]

Creation Done.





In [108]:
ecloud_root = './mol_data/ecloud'
save_file = './mol_data/ecloud.h5'
# Keep only '<integer>.npy' entries: anything else in the directory (for
# example '.ipynb_checkpoints') would make the integer sort key raise ValueError.
files = [
    name for name in os.listdir(ecloud_root)
    if name.endswith('.npy') and name[:-4].isdecimal()
]
# Sort numerically so that '10.npy' comes after '9.npy'.
files = sorted(files, key=lambda x: int(x.split('.')[0]))
num_files = len(files)
smiles = []

In [None]:
import numpy as np
import h5py

# Self-contained demo: store voxel grids next to variable-length strings in HDF5.
# The demo uses its own variable names so that running the notebook top to
# bottom does not overwrite the num_files / save_file values that the export
# cells below depend on.
demo_count = 10  # just for this example
demo_voxels = np.random.rand(demo_count, 32, 32, 32).astype(np.float16)
demo_strings = [f"string_{i}" for i in range(demo_count)]

demo_file = "data.h5"

with h5py.File(demo_file, 'w') as hf:
    # Saving voxel data
    dset_voxel = hf.create_dataset(
        name='voxel_data',
        shape=(demo_count, 32, 32, 32),
        dtype=np.float16)
    dset_voxel[:] = demo_voxels

    # Saving corresponding string data as UTF-8 bytes in a variable-length
    # string dataset. str.encode replaces np.string_, which NumPy 2.0 removed.
    byte_strings = [s.encode('utf-8') for s in demo_strings]
    dset_strings = hf.create_dataset(
        name='string_data',
        shape=(demo_count,),
        dtype=h5py.string_dtype(encoding='utf-8'))
    dset_strings[:] = byte_strings


In [124]:
# Export every discovered grid, with its SMILES string and property
# conditions, into one HDF5 file.
# NOTE: mode 'w' truncates save_file if it already exists.
with h5py.File(save_file, 'w') as hf:

    # Keep the dataset handles so the loop does not look them up by name on
    # every iteration.
    eclouds_dset = hf.create_dataset(
        name='eclouds',
        shape=(num_files, 32, 32, 32),
        dtype=np.float16)

    smiles_dset = hf.create_dataset(
        name='smiles',
        shape=(num_files,),
        dtype=h5py.string_dtype(encoding='utf-8'))

    # NOTE(review): float16 keeps roughly three significant digits — confirm
    # that is precise enough for the stored qed / logp / TPSA values.
    conditions_dset = hf.create_dataset(
        name='conditions',
        shape=(num_files, 3),
        dtype=np.float16)

    for i in tqdm(range(num_files)):
        file_name = os.path.join(ecloud_root, files[i])
        # The file stem is the positional row index into mols_data; parse it
        # the same way as the sort key used when listing the files.
        mol_id = int(files[i].split('.')[0])
        ecloud = np.load(file_name)
        eclouds_dset[i] = ecloud
        data = mols_data.iloc[mol_id]
        smiles = data.SMILES
        condition = [data.qed, data.logp, data.TPSA]
        smiles_dset[i] = smiles
        conditions_dset[i] = condition

100%|██████████| 83/83 [00:00<00:00, 637.77it/s]


In [None]:
# 64-point variant of the export: stores the grids only.
# NOTE(review): this opens the same save_file in 'w' mode as the 32-point
# export, so running both leaves only this cell's output — confirm the
# intended output path.
with h5py.File(save_file, 'w') as hf:

    eclouds_dset = hf.create_dataset(
        name='eclouds',
        shape=(num_files, 64, 64, 64),
        dtype=np.float16)

    for i in tqdm(range(num_files)):
        # Use the discovered file names: the generation loop only saves rows
        # with SPLIT == 'train', so ids need not be the range 0..num_files-1.
        file_name = os.path.join(ecloud_root, files[i])
        ecloud = np.load(file_name)
        eclouds_dset[i] = ecloud
    

In [104]:
import numpy as np
import h5py

# Self-contained demo: store voxel grids next to variable-length strings in HDF5.
# NOTE: this cell rebinds num_files and save_file; re-run the file-discovery
# cell before running an export cell again.
# Example data
num_files = 10  # just for this example
voxel_data = np.random.rand(num_files, 32, 32, 32).astype(np.float16)
string_data = [f"string_{i}" for i in range(num_files)]

save_file = "data.h5"

with h5py.File(save_file, 'w') as hf:
    # Saving voxel data
    dset_voxel = hf.create_dataset(
        name='voxel_data',
        shape=(num_files, 32, 32, 32),
        dtype=np.float16)
    dset_voxel[:] = voxel_data

    # Saving corresponding string data as UTF-8 bytes in a variable-length
    # string dataset. str.encode replaces np.string_, which NumPy 2.0 removed.
    byte_strings = [s.encode('utf-8') for s in string_data]
    dset_strings = hf.create_dataset(
        name='string_data',
        shape=(num_files,),
        dtype=h5py.string_dtype(encoding='utf-8'))
    dset_strings[:] = byte_strings


In [105]:
import h5py

# The file where data was saved
save_file = "data.h5"

with h5py.File(save_file, 'r') as hf:
    # Load voxel data
    voxel_data_loaded = hf['voxel_data'][:]

    # Load and decode string data
    byte_strings_loaded = hf['string_data'][:]
    # h5py 3.x returns bytes for variable-length strings, while older releases
    # already return str for UTF-8 data, so decode only when needed.
    string_data_loaded = [
        bs.decode('utf-8') if isinstance(bs, bytes) else bs
        for bs in byte_strings_loaded
    ]

print(voxel_data_loaded.shape)  # should print (num_files, 32, 32, 32)


(10, 32, 32, 32)
