# Scaffold splitting

In [2]:
import deepchem as dc
import numpy as np

# Demo data set: a handful of SMILES strings.

data_test = [
    "CC(C)Cl",
    "CCC(C)CO",
    "CCCCCCCO",
    "CCCCCCCC(=O)OC",
    "O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C",
    "c3ccc2nc1ccccc1cc2c3",
    "Nc2cccc3nc1ccccc1cc23",
    "C1CCCCCC1",
]

n_samples = len(data_test)

# Placeholder features and labels: only the ids (the SMILES strings) carry
# real information in this demo.
Xs = np.arange(n_samples)
Ys = np.ones(n_samples)

# DeepChem dataset with the SMILES strings stored in the ids field.
dataset = dc.data.DiskDataset.from_numpy(
    X=Xs,
    y=Ys,
    w=np.zeros(n_samples),
    ids=data_test,
)

scaffoldsplitter = dc.splits.ScaffoldSplitter()
train, valid, test = scaffoldsplitter.split(
    dataset=dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
)

# Last expression of the cell: the notebook displays the scaffold groups
# together with the train / valid / test index lists.
scaffoldsplitter.generate_scaffolds(dataset), train, valid, test

([[0, 1, 2, 3], [5, 6], [7], [4]], [0, 1, 2, 3, 5, 6], [7], [4])

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from typing import List, Tuple
import numpy as np
import rdkit
import pandas as pd


def get_3d_coords_from_smiles(smiles:str, add_hydrogen:bool=True, seed:int=0xf00d):
    """Embed a molecule in 3D and collect per-atom names, coordinates and bonds.

    Args:
        smiles: SMILES string of the molecule.
        add_hydrogen: if True, the result also contains the hydrogens added
            for the embedding; if False, only the atoms of the molecule as
            parsed from ``smiles`` are reported. Hydrogens are added for the
            conformer generation in both cases.
        seed: random seed handed to the ETKDGv3 embedding parameters.

    Returns:
        dict keyed by atom index. Each value is a dict with
        ``"atom_name"`` (element symbol), ``"coordinates"`` (numpy array
        built from the conformer position of that atom) and
        ``"bond_adjacency_list"`` (list of ``(end_atom_idx, bond_type_str)``
        tuples, stored only on the bond's begin atom).

    Raises:
        ValueError: if ``smiles`` cannot be parsed or no conformer could be
            embedded.
    """
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # necessary to add hydrogen for consistent conformer generation
    mol_with_hs = Chem.AddHs(m)

    ## 3D conformer generation
    ps = rdDistGeom.ETKDGv3()
    ps.randomSeed = seed
    # EmbedMolecule reports failure through a negative conformer id; check it
    # here instead of failing later with an opaque conformer lookup error.
    if AllChem.EmbedMolecule(mol_with_hs, ps) < 0:
        raise ValueError(f"3D embedding failed for SMILES: {smiles!r}")

    conf = mol_with_hs.GetConformer()

    ## With hydrogens we report the embedded molecule itself; without them we
    ## report the molecule as parsed. The conformer is read with the parsed
    ## molecule's own atom indices, which relies on AddHs keeping the original
    ## atoms at their indices and appending the new hydrogens after them.
    if add_hydrogen:
        m = mol_with_hs

    ## initializing properties dictionary with the 3d coordinates of each atom.
    ## Atoms are addressed by their own index (rather than through a
    ## self-substructure match) so that the keys always agree with the atom
    ## indices used by the bonds below.
    properties = {
        atom.GetIdx(): {
            "atom_name": atom.GetSymbol(),
            "coordinates": np.array(conf.GetAtomPosition(atom.GetIdx())),
            "bond_adjacency_list": [],
        }
        for atom in m.GetAtoms()
    }

    ## creating adjacency list for bonds (each bond is recorded once, on its begin atom)
    for bond in m.GetBonds():
        properties[bond.GetBeginAtomIdx()]["bond_adjacency_list"].append(
            (bond.GetEndAtomIdx(), str(bond.GetBondType()))
        )

    return properties


#
def translate_properties(properties):
    """Translate all 3D coordinates so that atom 0 sits at the origin (0, 0, 0).

    Args:
        properties: dict keyed by atom index; every value is a dict holding a
            ``"coordinates"`` entry that supports element-wise subtraction
            (e.g. a numpy array). Key ``0`` is used as the reference atom.

    Returns:
        A deep copy of ``properties`` in which the coordinates of atom 0 have
        been subtracted from every atom's coordinates. The input is left
        unmodified. An empty input yields an empty copy.
    """
    from copy import deepcopy

    normalized_properties = deepcopy(properties)

    # Nothing to translate (and no atom 0 to use as reference) for empty input.
    if not normalized_properties:
        return normalized_properties

    first_atom_coordinates = properties[0]["coordinates"]

    for atom in normalized_properties.values():
        atom["coordinates"] = atom["coordinates"] - first_atom_coordinates

    return normalized_properties



def print_coordinates(properties):
    """Print one line per atom: its name, its key and its 3D coordinates."""
    for atom_key, atom in properties.items():
        label = f"{atom['atom_name']}{atom_key}"
        print(f"Name: {label} Coords:{atom['coordinates']}")


# Demo: embed one molecule in 3D (hydrogens included), print the raw
# coordinates, then print them again after translating atom 0 to the origin.
smiles = 'CC(C)C'

properties = get_3d_coords_from_smiles(smiles, add_hydrogen=True, seed=3)
print_coordinates(properties)

normalized_properties = translate_properties(properties)
print_coordinates(properties=normalized_properties)


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from typing import List, Tuple
import numpy as np
import rdkit
import pandas as pd
from pymatgen.core import Structure, Lattice, Molecule, Element


from pymatgen.io.babel import BabelMolAdaptor

from pymatgen.core.lattice import Lattice


%load_ext autoreload
%autoreload 2


smiles = 'CC(C)C'

# Parse with sanitization switched off, then ask RDKit to list the chemistry
# problems of the resulting molecule and report how many it found.
m = Chem.MolFromSmiles(smiles, sanitize=False)
problems = Chem.DetectChemistryProblems(m)
print(len(problems))


# Small pymatgen example structure: cubic lattice (a = 4.2) with three sites
# and a per-site "magmom" property.
struct = Structure(
    Lattice.cubic(4.2),
    ["Na", "K", "K"],
    [
        [0, 0, 0],
        [0.5, 0.5, 0.5],
        [0, 0, 0.5],
    ],
    site_properties={"magmom": [-2, 2, 2]},
)





In [None]:
from pymatgen.core import Element

# For every atom of the RDKit molecule, look up the matching pymatgen Element
# by atomic number and print its `X` attribute.
for atom in m.GetAtoms():
    z = atom.GetAtomicNum()
    elem = Element.from_Z(z)
    print(elem.X)

In [None]:
import json
import gzip
from pymatgen.core.structure import Structure, Molecule

from utils import download_url

#raw_url= "https://ml.materialsproject.org/projects/matbench_mp_is_metal.json.gz"
#raw_dir= "../data/matbench/mp_is_metal"
#download_url(raw_url, raw_dir)


json_filename = "../old_data/matbench/mp_is_metal/raw/matbench_mp_is_metal.json.gz"


# Open the gzip archive in text mode (UTF-8) and let json parse the stream
# directly. This avoids keeping the raw bytes and the decoded string of the
# whole dataset alive as extra notebook variables next to the parsed `data`.
with gzip.open(json_filename, 'rt', encoding='utf-8') as fin:
    data = json.load(fin)


In [None]:
from utils import from_structure_to_molecule
from pymatgen.io.babel import BabelMolAdaptor

from pymatgen.core.lattice import Lattice

# Sanitization steps applied to every converted molecule; built once because
# the mask does not change between iterations.
sanitize_ops = (
    Chem.SanitizeFlags.SANITIZE_FINDRADICALS
    | Chem.SanitizeFlags.SANITIZE_KEKULIZE
    | Chem.SanitizeFlags.SANITIZE_SETAROMATICITY
    | Chem.SanitizeFlags.SANITIZE_SETCONJUGATION
    | Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION
    | Chem.SanitizeFlags.SANITIZE_SYMMRINGS
)

for i, entry in enumerate(data["data"]):
    struct = Structure.from_dict(entry[0])

    # then the following conversion : pymatgen.Structure -> pymatgen.Molecule -> pybel_mol -> mol file (to retain 3D information) -> rdkit molecule
    mol = Molecule(species=struct.species, coords=struct.cart_coords)
    pybel_mol = BabelMolAdaptor(mol).pybel_mol
    # ideally, we would like to give the correct 3D coordinates to the molecule, so we use .mol file
    mol_file = pybel_mol.write('mol')

    new_mol = Chem.MolFromMolBlock(mol_file, sanitize=False)
    if new_mol is None:
        # RDKit could not build a molecule from the MOL block; report it and
        # keep going instead of crashing the whole scan.
        print(f"index {i}, could not parse mol block")
        continue

    # Partial sanitization; errors are caught by RDKit (catchErrors=True) so
    # that the remaining problems can be listed explicitly below.
    new_mol.UpdatePropertyCache(strict=False)
    Chem.SanitizeMol(new_mol, sanitize_ops, catchErrors=True)

    problems = Chem.DetectChemistryProblems(new_mol)
    if problems:
        print(f"index {i}, problems {problems}")