In [None]:
import h5py
import pandas as pd
import numpy as np
import os, re
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
import openbabel as ob
from openbabel import pybel as pb
from multiprocessing import Pool
# Disable every RDKit log channel matching 'rdApp.*' so RDKit messages do not
# flood the notebook output.
RDLogger.DisableLog('rdApp.*') #hiding the warning messages
# Absolute path of the working directory at the time this cell runs; later
# cells build their sub-directory paths (xyz/, mdl/, tmp/, ...) from it.
current_path = os.path.abspath('.')

In [None]:
# To download DES370K database
# Always work from the notebook root so that re-running this cell after a later
# cell has changed the working directory still puts the files in the right place.
os.chdir(current_path)
# Skip the download when the CSV is already present (the raw download is renamed
# right after fetching, so `wget -c` alone would start over on every re-run).
if not os.path.exists('DES370K.csv'):
    os.system('wget -c https://springernature.figshare.com/ndownloader/files/24032198')
    os.system('mv 24032198 DES370K.csv')
# Create every working directory the later cells chdir into or write to.
# exist_ok=True keeps the cell idempotent.
for _dirname in ('xyz', 'mdl', 'tmp', 'tmp2', 'GAFF_ac2', 'frcmod', 'sdf'):
    os.makedirs(os.path.join(current_path, _dirname), exist_ok=True)

In [None]:
# Use .csv file to generate .h5 file
# Rows of the CSV that share a group_id are consecutive; each run of rows is
# stored as one HDF5 group named after its group_id.
CSVFile = pd.read_csv('DES370K.csv')
smiles0 = CSVFile['smiles0']
smiles1 = CSVFile['smiles1']
charge0 = CSVFile['charge0']
charge1 = CSVFile['charge1']
natoms0 = CSVFile['natoms0']
natoms1 = CSVFile['natoms1']
group_id = CSVFile['group_id']
k_index = CSVFile['k_index']
energy = CSVFile['cbs_CCSD(T)_all']
xyz = CSVFile['xyz']
elements = CSVFile['elements']

dt = h5py.special_dtype(vlen=str)


def _write_group(hdf, gid, row, k_indices, energies, positions):
    """Write one group to the open HDF5 file `hdf`.

    gid       -- group id, used (as a string) for the HDF5 group name
    row       -- index of a CSV row that belongs to this group; the per-group
                 metadata (SMILES, charges, atom counts, elements) is read from it
    k_indices -- k_index values collected for the rows of the group
    energies  -- 'cbs_CCSD(T)_all' values collected for the rows of the group
    positions -- list of (-1, 3) coordinate arrays, one per row of the group
    """
    grp = hdf.create_group(str(gid))
    grp.create_dataset('smiles0', shape=(1,), data=smiles0[row], dtype=dt)
    grp.create_dataset('smiles1', shape=(1,), data=smiles1[row], dtype=dt)
    grp.create_dataset('charge0', data=np.array([charge0[row]]))
    grp.create_dataset('charge1', data=np.array([charge1[row]]))
    grp.create_dataset('natoms0', data=np.array([natoms0[row]]))
    grp.create_dataset('natoms1', data=np.array([natoms1[row]]))
    grp.create_dataset('k_index', data=np.array(k_indices))
    grp.create_dataset('cbs_CCSD(T)_all', data=np.array(energies))
    grp.create_dataset('xyz', data=np.array(positions).astype(float))
    grp.create_dataset('elements', shape=(natoms0[row] + natoms1[row],),
                       data=re.split(' ', elements[row]), dtype=dt)


# The context manager closes the file even if a row fails to parse.
with h5py.File('DES370K.h5', 'w') as hdfFile:
    current_id = None
    current_index, current_energy, position = [], [], []
    n_rows = len(CSVFile)
    for i in range(n_rows):
        if i > 0 and group_id[i] != current_id:
            # Row i starts a new group: flush the finished one, taking its
            # metadata from row i-1, the last row that belongs to it.
            _write_group(hdfFile, current_id, i - 1, current_index, current_energy, position)
            current_index, current_energy, position = [], [], []
        current_id = group_id[i]
        current_index.append(k_index[i])
        current_energy.append(energy[i])
        position.append(np.array(re.split(' ', xyz[i])).reshape(-1, 3))
    if n_rows:
        # Flush the last group using its OWN last row. Reading row n_rows-2 here
        # would store the previous dimer's metadata whenever the final group
        # consists of a single row.
        _write_group(hdfFile, current_id, n_rows - 1, current_index, current_energy, position)

In [None]:
# Open the HDF5 file read-only; the handle `f` stays open because later cells
# keep reading from it.
f = h5py.File('DES370K.h5','r')
# Substrings that disqualify a dimer when found in either monomer SMILES
# (metal ions and noble gases).
excluded_tokens = ('Li', 'Na', 'Mg', 'K', 'Ca', 'Ar', 'He', 'Kr', 'Ne', 'Xe')
all_key = []
for key in f.keys():
    smiles0 = f[key]['smiles0'][0].decode()
    smiles1 = f[key]['smiles1'][0].decode()
    has_excluded = any(
        token in smiles
        for smiles in (smiles0, smiles1)
        for token in excluded_tokens
    )
    if not has_excluded:
        all_key.append(key)
print(len(all_key), ' molecules')


In [None]:
# generate .xyz file
# One .xyz file is written per distinct monomer, using the first stored
# geometry (index 0 along the first axis of 'xyz') of the first dimer in which
# that monomer appears.
xyz_dir = current_path + '/xyz'
os.makedirs(xyz_dir, exist_ok=True)  # the directory is not created anywhere else
os.chdir(xyz_dir)
os.system('rm *.xyz')
# Brackets are awkward in file names / shell commands, so they are mapped to
# letters: '(' -> 'L', ')' -> 'R', '[' -> 'M', ']' -> 'Q'.
_bracket_to_letter = str.maketrans('()[]', 'LRMQ')
stored_smiles = []
charge = {}  # file-name-safe SMILES -> net charge, consumed by the antechamber cell
for key in all_key:
    group = f[key]
    natoms0 = group['natoms0'][:][0]
    natoms1 = group['natoms1'][:][0]
    elements = group['elements'][:]
    coord = group['xyz'][:]
    # (name, net charge, index of first atom, number of atoms) for each monomer;
    # monomer 1's atoms follow monomer 0's in the element/coordinate arrays.
    monomers = (
        (group['smiles0'][0].decode().translate(_bracket_to_letter), group['charge0'][:][0], 0, natoms0),
        (group['smiles1'][0].decode().translate(_bracket_to_letter), group['charge1'][:][0], natoms0, natoms1),
    )
    for name, net_charge, first_atom, n_atoms in monomers:
        if name in charge:
            # already written from an earlier dimer
            continue
        with open(name + '.xyz', 'w') as openfile:
            openfile.write(str(n_atoms) + ' \n')
            openfile.write('0 1 ' + name + '\n')
            for i in range(first_atom, first_atom + n_atoms):
                openfile.write(elements[i].decode() + '     '+str(coord[0,i,0])+'    '+str(coord[0,i,1])+'    '+str(coord[0,i,2])+'\n')
        stored_smiles.append(name)
        charge[name] = net_charge
        

In [None]:
# generate .mdl file
# NOTE: the directories are listed by explicit path. The earlier form
# os.walk(os.chdir(...)) handed os.walk the return value of os.chdir (None) and,
# as a side effect, moved the working directory to xyz/ before obabel ran, so
# the .mdl files were not written into mdl/.
xyz_dir = current_path + '/xyz'
mdl_dir = current_path + '/mdl'
os.makedirs(mdl_dir, exist_ok=True)
os.chdir(mdl_dir)
os.system('rm *.mdl')

# Base names of all .xyz inputs (sorted for a reproducible processing order).
all_name = [os.path.splitext(filename)[0]
            for filename in sorted(os.listdir(xyz_dir))
            if filename.endswith('.xyz')]

for name in all_name:
    os.system('obabel -ixyz ../xyz/' + name + '.xyz -omdl -O ' + name + '.mdl')

# Report every input for which obabel produced no .mdl file.
all_name1 = [os.path.splitext(filename)[0]
             for filename in os.listdir(mdl_dir)
             if filename.endswith('.mdl')]
for i in all_name:
    if i not in all_name1:
        print(i)

In [None]:
# generate .ac file. For [F,Cl,Br,I], do it manually
# Directories are listed by explicit path: os.chdir() returns None, so
# os.walk(os.chdir(...)) never received a directory to walk.
mdl_dir = current_path + '/mdl'
tmp_dir = current_path + '/tmp'
all_name1 = [os.path.splitext(filename)[0]
             for filename in sorted(os.listdir(mdl_dir))
             if filename.endswith('.mdl')]

os.makedirs(tmp_dir, exist_ok=True)
os.chdir(tmp_dir)
for name in all_name1:
    # `charge` maps the file-name-safe SMILES to the monomer net charge; it is
    # filled by the cell that writes the .xyz files.
    os.system('antechamber -i ../mdl/' + name + '.mdl -fi mdl -o ' + name + '.ac -fo ac -pf y -c bcc -nc ' + str(charge[name]) + ' -at gaff2')

# Report every molecule for which antechamber produced no .ac file.
all_name2 = [os.path.splitext(filename)[0]
             for filename in os.listdir(tmp_dir)
             if filename.endswith('.ac')]
for i in all_name1:
    if i not in all_name2:
        print(i)

In [None]:
# generate frcmod file
# Directories are listed by explicit path: os.chdir() returns None, so
# os.walk(os.chdir(...)) never received a directory to walk.
tmp_dir = current_path + '/tmp'
tmp2_dir = current_path + '/tmp2'
all_name3 = [os.path.splitext(filename)[0]
             for filename in sorted(os.listdir(tmp_dir))
             if filename.endswith('.ac')]

os.makedirs(tmp2_dir, exist_ok=True)
os.chdir(tmp2_dir)
for name in all_name3:
    # The output file is named after the molecule, without an extension.
    os.system('parmchk2 -i ../tmp/' + name + '.ac -f ac -o ' + name)

# Report every molecule for which parmchk2 produced no output file.
all_name4 = os.listdir(tmp2_dir)
for i in all_name3:
    if i not in all_name4:
        print(i)

In [None]:
# combine the .ac files and frcmod files together
# For every dimer whose two monomers both have an .ac file, concatenate the
# monomer .ac files into GAFF_ac2/<key>.ac and the monomer frcmod files into
# frcmod/<key>.
tmp_dir = current_path + '/tmp'
tmp2_dir = current_path + '/tmp2'
ac_out_dir = current_path + '/GAFF_ac2'
frcmod_out_dir = current_path + '/frcmod'

# Listed by explicit path: os.chdir() returns None, so os.walk(os.chdir(...))
# never received a directory to walk.
all_name3 = [os.path.splitext(filename)[0]
             for filename in os.listdir(tmp_dir)
             if filename.endswith('.ac')]
available = set(all_name3)  # O(1) membership tests inside the loop

os.makedirs(ac_out_dir, exist_ok=True)
os.makedirs(frcmod_out_dir, exist_ok=True)
os.chdir(ac_out_dir)
# '(' -> 'L', ')' -> 'R', '[' -> 'M', ']' -> 'Q': same file-name mapping as the
# cell that wrote the monomer files.
_bracket_to_letter = str.maketrans('()[]', 'LRMQ')
for key in f.keys():
    smiles0 = f[key]['smiles0'][0].decode().translate(_bracket_to_letter)
    smiles1 = f[key]['smiles1'][0].decode().translate(_bracket_to_letter)
    if smiles0 not in available or smiles1 not in available:
        continue

    # `with` closes every input and output handle, including on error.
    with open(ac_out_dir + '/' + key + '.ac', 'w') as ac_file:
        for monomer in (smiles0, smiles1):
            with open(tmp_dir + '/' + monomer + '.ac', 'r') as part:
                ac_file.write(part.read() + '\n')

    with open(frcmod_out_dir + '/' + key, 'w') as frcmod_file:
        for monomer in (smiles0, smiles1):
            with open(tmp2_dir + '/' + monomer, 'r') as part:
                frcmod_file.write(part.read() + '\n')


In [None]:
# generate .sdf files for the monomers of every selected dimer
# Each distinct monomer is written to a scratch tmp.xyz (first stored geometry)
# and converted to ../sdf/<name>.sdf with obabel; RDKit then re-reads every
# .sdf and the names it cannot parse are collected in `failed_smiles`.
tmp_dir = current_path + '/tmp'
sdf_dir = current_path + '/sdf'
os.makedirs(tmp_dir, exist_ok=True)
os.makedirs(sdf_dir, exist_ok=True)  # the directory is not created anywhere else
os.chdir(tmp_dir)
os.system('rm *.xyz')
# '(' -> 'L', ')' -> 'R', '[' -> 'M', ']' -> 'Q' (file-name-safe SMILES).
_bracket_to_letter = str.maketrans('()[]', 'LRMQ')
stored_smiles = []
# NOTE: the global `charge` dict is deliberately NOT reset here; the antechamber
# cell looks charges up in it and would fail with KeyError if re-run afterwards.
for key in all_key:
    group = f[key]
    natoms0 = group['natoms0'][:][0]
    natoms1 = group['natoms1'][:][0]
    elements = group['elements'][:]
    coord = group['xyz'][:]
    # (name, index of first atom, number of atoms) for each monomer; monomer 1's
    # atoms follow monomer 0's in the element/coordinate arrays.
    monomers = (
        (group['smiles0'][0].decode().translate(_bracket_to_letter), 0, natoms0),
        (group['smiles1'][0].decode().translate(_bracket_to_letter), natoms0, natoms1),
    )
    for name, first_atom, n_atoms in monomers:
        if name in stored_smiles:
            continue
        with open('tmp.xyz', 'w') as openfile:
            openfile.write(str(n_atoms) + ' \n')
            openfile.write('0 1 ' + name + '\n')
            for i in range(first_atom, first_atom + n_atoms):
                openfile.write(elements[i].decode() + '     '+str(coord[0,i,0])+'    '+str(coord[0,i,1])+'    '+str(coord[0,i,2])+'\n')
        stored_smiles.append(name)
        os.system('obabel -ixyz ./tmp.xyz -osdf -O ../sdf/' + name + '.sdf')

os.chdir(sdf_dir)
failed_smiles = []
for file in sorted(os.listdir(sdf_dir)):
    if not file.endswith('.sdf'):
        continue
    supplier = Chem.SDMolSupplier(file, removeHs=False)
    # An empty SDF (e.g. a failed conversion) has no record 0; count it as a
    # failure instead of raising IndexError.
    mol = supplier[0] if len(supplier) > 0 else None
    if mol is None:
        failed_smiles.append(os.path.splitext(file)[0])
print(len(failed_smiles), 'molecules fail!')

os.chdir(tmp_dir)

In [None]:
# Rebuild a 3D structure from SMILES for every molecule RDKit could not read,
# and write it as <name>.sdf into tmp/ for manual inspection.
os.chdir(current_path + '/tmp')
os.system('rm *')
# Undo the file-name mapping: 'L' -> '(', 'R' -> ')', 'M' -> '[', 'Q' -> ']'.
_letter_to_bracket = str.maketrans('LRMQ', '()[]')
for name in failed_smiles:
    aa = name.translate(_letter_to_bracket)
    mol = pb.readstring('smiles', aa)
    mol.make3D()
    # Use the name of the current molecule; concatenating the whole
    # `failed_smiles` list with '.sdf' raises TypeError.
    mol.write('sdf', name + '.sdf', overwrite=True)
# For N atom, the charge is wrong,change it manually