In [1]:
# Atomic-number vocabulary: the integers 1..118 followed by the 'ukn' entry.
# 'ukn' is kept last so that a lookup falling back to the final index lands on it.
valid_atomic_nums = [*range(1, 119), 'ukn']

# Bond-type vocabulary (string names), again terminated by 'ukn'.
valid_bond_types = [
    'SINGLE',
    'DOUBLE',
    'TRIPLE',
    'AROMATIC',
    'ukn',
]

# Feature name -> vocabulary list used to turn raw values into integer indices.
valid_features_dic = {
    'atomomic_nums': valid_atomic_nums,
    'bond_types': valid_bond_types,
}

def safe_index(l, e):
    """
    Return the index of element e in list l.

    If e is not present, return the last index (``len(l) - 1``), so a list
    whose final entry is a catch-all maps every unknown element to it.
    For an empty list this fallback is ``-1``.

    Only the ``ValueError`` that ``list.index`` raises for a missing element
    is handled; any other exception (for example one raised while comparing
    elements) propagates to the caller instead of being silently swallowed.
    """
    try:
        return l.index(e)
    except ValueError:
        # list.index signals "not found" with ValueError.
        return len(l) - 1

In [6]:
from rdkit import Chem
import numpy as np

def smiles2graph(smiles):
    """
    Convert a SMILES string into integer-encoded graph arrays.

    Parameters
    ----------
    smiles : str
        SMILES string passed to ``Chem.MolFromSmiles``.

    Returns
    -------
    None or tuple
        ``None`` when ``Chem.MolFromSmiles`` returns ``None`` (the string
        could not be turned into a molecule). Otherwise a tuple
        ``(atom_atomic_nums, edge_index, edge_attr)`` of int64 arrays:

        * ``atom_atomic_nums`` -- one entry per atom: the index of its atomic
          number in ``valid_features_dic['atomomic_nums']``.
        * ``edge_index`` -- connectivity in COO format, shape
          ``[2, num_edges]``; every bond appears once per direction.
        * ``edge_attr`` -- shape ``[num_edges]``: the index of each edge's
          bond type in ``valid_features_dic['bond_types']``.

        For a molecule without bonds ``edge_index`` has shape ``(2, 0)`` and
        ``edge_attr`` has shape ``(0,)``, i.e. the same rank as for bonded
        molecules.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    atom_vocab = valid_features_dic['atomomic_nums']
    bond_vocab = valid_features_dic['bond_types']

    atom_atomic_nums = np.array(
        [safe_index(atom_vocab, atom.GetAtomicNum()) for atom in mol.GetAtoms()],
        dtype=np.int64,
    )

    edges_list = []
    edge_features_list = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        # The bond type is looked up by its string form.
        edge_feature = safe_index(bond_vocab, str(bond.GetBondType()))

        # Add the edge in both directions, sharing the same feature.
        edges_list.extend(((i, j), (j, i)))
        edge_features_list.extend((edge_feature, edge_feature))

    if edges_list:
        # List of (src, dst) pairs -> array of shape [2, num_edges].
        edge_index = np.array(edges_list, dtype=np.int64).T
    else:
        # Transposing an empty array would not yield the (2, 0) shape.
        edge_index = np.empty((2, 0), dtype=np.int64)

    # One integer per edge; an empty list gives shape (0,), which keeps the
    # bond-free case consistent with the bonded case.
    edge_attr = np.array(edge_features_list, dtype=np.int64)

    return atom_atomic_nums, edge_index, edge_attr

In [8]:
import pandas as pd 

# Pull the 'smiles' column of the HIV dataset out as an array of values.
hiv_table = pd.read_csv('data/HIV.csv')
smiles_list = hiv_table['smiles'].values

# Print every SMILES string for which smiles2graph returns None.
for smiles in smiles_list:
    graph = smiles2graph(smiles)
    if graph is not None:
        continue
    print(smiles)
    

[14:06:15] Explicit valence for atom # 3 Al, 6, is greater than permitted
[14:06:16] Explicit valence for atom # 5 B, 5, is greater than permitted


O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1


[14:06:19] Explicit valence for atom # 16 Al, 9, is greater than permitted


Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35([O+]=C6c7c(cc(O)cc7[OH+]3)OC(c3ccc(O)cc3O)C6O)([O+]=C3c6c(cc(O)cc6[OH+]5)OC(c5ccc(O)cc5O)C3O)[OH+]4)C2O)c(O)c1


[14:06:20] Explicit valence for atom # 4 Al, 9, is greater than permitted


CC1=C2[OH+][AlH3-3]34([O+]=C2C=CN1C)([O+]=C1C=CN(C)C(C)=C1[OH+]3)[O+]=C1C=CN(C)C(C)=C1[OH+]4


[14:06:24] Explicit valence for atom # 12 Al, 7, is greater than permitted
[14:06:24] Explicit valence for atom # 13 Al, 7, is greater than permitted


CC(c1cccs1)=[N+]1[N-]C(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
CC(c1ccccn1)=[N+]1[N-]C(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2


[14:06:25] Explicit valence for atom # 6 Ge, 5, is greater than permitted


[Na+].c1ccc([SH+][GeH2+]2[SH+]c3ccccc3[SH+]2)c([SH+][GeH2+]2[SH+]c3ccccc3[SH+]2)c1


In [None]:
import os
import lmdb
from tqdm import tqdm
from multiprocessing import Pool

def write_lmdb(smiles_list, outpath='lmdb_file', nthreads=8):
    """
    Convert SMILES strings to graphs in parallel and store them in an LMDB file.

    Each input is passed to ``smiles2graph`` in a worker pool. Results that
    are not ``None`` are pickled and written under consecutive ASCII keys
    ``b'0'``, ``b'1'``, ...; inputs whose result is ``None`` are skipped and
    do not consume a key. Read a record back with ``pickle.loads``.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to convert.
    outpath : str
        Directory (created if missing) that will contain the database file,
        which is named ``.lmdb``. An existing file of that name is deleted
        first.
    nthreads : int
        Number of worker processes in the pool.

    Returns
    -------
    None
        The number of records written is printed.
    """
    # Local import so this cell does not depend on an import made elsewhere.
    import pickle

    os.makedirs(outpath, exist_ok=True)
    output_name = os.path.join(outpath, '.lmdb')
    try:
        os.remove(output_name)
    except FileNotFoundError:
        # Nothing to clean up on the first run.
        pass

    env_new = lmdb.open(
        output_name,
        subdir=False,
        readonly=False,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=1,
        map_size=int(100e9),
    )
    try:
        # The transaction commits when the block exits normally and aborts
        # if an exception escapes, so a failed run leaves no partial write.
        # NOTE(review): the pool must be able to resolve smiles2graph in the
        # worker processes; with the 'spawn' start method a function defined
        # in a notebook cell may not be importable there -- confirm on the
        # target platform.
        with env_new.begin(write=True) as txn_write, Pool(nthreads) as pool:
            i = 0
            for inner_output in tqdm(pool.imap(smiles2graph, smiles_list)):
                if inner_output is None:
                    continue
                # LMDB stores bytes only, so the tuple of arrays is serialized.
                txn_write.put(f'{i}'.encode("ascii"), pickle.dumps(inner_output))
                i += 1
            print('{} lines'.format(i))
    finally:
        env_new.close()

# Run write_lmdb on four short SMILES strings.
demo_smiles = ['CCO', 'CCN', 'CCF', 'CCCl']
write_lmdb(
    demo_smiles,
    outpath='lmdb_file',
    nthreads=8,
)

0it [00:00, ?it/s]

In [None]:
import deepchem as dc

# creation of demo data set with some smiles strings

def splitter(smiles):
    """
    Return the result of ``smiles.split()`` called with no arguments.

    For a ``str`` this is the list of pieces separated by runs of whitespace.
    """
    pieces = smiles.split()
    return pieces




# NOTE(review): data_test is not defined in any cell visible here; it must
# already exist in the kernel (presumably a sequence of SMILES strings --
# confirm) before this cell can run.
n_samples = len(data_test)

# Placeholder features (all zeros) and labels (all ones), one per entry.
Xs = np.zeros(n_samples)
Ys = np.ones(n_samples)

# Build a deepchem dataset that carries the entries of data_test in its ids
# field; the weights are all zero.
dataset = dc.data.DiskDataset.from_numpy(
    X=Xs,
    y=Ys,
    w=np.zeros(n_samples),
    ids=data_test,
)

# Split the dataset into train and test parts with deepchem's ScaffoldSplitter.
scaffoldsplitter = dc.splits.ScaffoldSplitter()
train, test = scaffoldsplitter.train_test_split(dataset)

In [None]:
# Seven example SMILES strings.
smiles = [
    "CC(C)Cl",
    "CCC(C)CO",
    "CCCCCCCO",
    "CCCCCCCC(=O)OC",
    "c3ccc2nc1ccccc1cc2c3",
    "Nc2cccc3nc1ccccc1cc23",
    "C1CCCCCC1",
]


