In [15]:
from OGB_mol import smiles2graph
import pandas as pd

df = pd.read_csv('data/QM9.csv')
smiles = df['smiles'].values

In [17]:
s = smiles[0]
g = smiles2graph(s)
print(g)

{'edge_index': array([[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 4],
       [1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 4, 8]]), 'edge_feat': array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 0, 1],
       [1, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1],
       [3, 0, 1]]), 'node_feat': array([[5, 0, 4, 5, 3, 0, 2, 0, 0],
       [5, 0, 4, 5, 2, 0, 2, 0, 0],
       [5, 0, 4, 5, 2, 0, 2, 0, 0],
       [6, 0, 2, 5, 0, 0, 1, 0, 0],
       [5, 0, 3, 5, 0, 0, 1, 1, 1],
       [7, 0, 2, 5, 0, 0, 1, 1, 1],
       [5, 0, 3, 5, 1, 0, 1, 1, 1],
       [6, 0, 2, 5, 0, 0, 1, 1, 1],
       [7, 0, 2, 5, 0, 0, 1, 1, 1]]), 'num_nodes': 9}


In [2]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def smi2_2Dcoords(smi):
    mol = Chem.MolFromSmiles(smi)
    mol = AllChem.AddHs(mol)
    AllChem.Compute2DCoords(mol)
    coordinates = mol.GetConformer().GetPositions().astype(np.float32)
    len(mol.GetAtoms()) == len(coordinates), "2D coordinates shape is not align with {}".format(smi)
    return coordinates

smi = 'CC1=C(C(=O)OC2CCCC2)[C@H](c2ccccc2OC(C)C)C2=C(O)CC(C)(C)CC2=[N+]1'

print(smi2_2Dcoords(smi))

[[-1.5280366  -3.4118583   0.        ]
 [-0.32651597 -2.5138893   0.        ]
 [-0.50341964 -1.0243574   0.        ]
 [-1.8818438  -0.43279454  0.        ]
 [-2.1646101   0.31253067  0.        ]
 [-3.0833645  -1.3307635   0.        ]
 [-4.4617887  -0.73920065  0.        ]
 [-5.687098    0.12602569  0.        ]
 [-6.888619   -0.7719433   0.        ]
 [-6.40589    -2.1921449   0.        ]
 [-4.9060264  -2.1719089   0.        ]
 [ 0.698101   -0.12638843  0.        ]
 [-0.3948677   0.9009477   0.        ]
 [-1.8310512   0.46807712  0.        ]
 [-2.9240198   1.4954132   0.        ]
 [-2.580805    2.9556198   0.        ]
 [-1.1446215   3.3884904   0.        ]
 [-0.05165285  2.3611543   0.        ]
 [ 1.3845307   2.794025    0.        ]
 [ 1.7277455   4.2542315   0.        ]
 [ 0.26753882  4.5974464   0.        ]
 [ 3.1879523   3.9110167   0.        ]
 [ 2.0765252  -0.7179513   0.        ]
 [ 3.278046    0.18001767  0.        ]
 [ 3.1011422   1.6695495   0.        ]
 [ 4.65647    -0.41154522

In [5]:
from ogb.graphproppred import GraphPropPredDataset

# Download and process data at './dataset/ogbg_molhiv/'
dataset = GraphPropPredDataset(name = "ogbg-molhiv", root = 'dataset/')
 
split_idx = dataset.get_idx_split() 

train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

### set i as an arbitrary index
i = 0
graph, label = dataset[i] # graph: library-agnostic graph object

  loaded_dict = torch.load(pre_processed_file_path, 'rb')


In [3]:
import deepchem as dc
import numpy as np
from deepchem.feat import Featurizer
from rdkit import Chem
from rdkit.Chem import rdmolfiles
from rdkit.Chem import rdmolops
from rdkit.Chem.rdchem import Mol

class MyMolecularFeaturizer(Featurizer):

    def __init__(self, use_original_atoms_order=False):
        self.use_original_atoms_order = use_original_atoms_order

    def featurize(self, datapoints, **kwargs) -> np.ndarray:
 
        # Special case handling of single molecule
        if isinstance(datapoints, str) or isinstance(datapoints, Mol):
            datapoints = [datapoints]
        else:
            # Convert iterables to list
            datapoints = list(datapoints)

        features: list = []
        for i, mol in enumerate(datapoints):

            if isinstance(mol, str):
                mol = Chem.MolFromSmiles(mol)
            print(mol)
            mol = Chem.SanitizeMol(mol, catchErrors=True)

            if not(self.use_original_atoms_order):
                new_order = rdmolfiles.CanonicalRankAtoms(mol)
                mol = rdmolops.RenumberAtoms(mol, new_order)

            features.append(self._featurize(mol))
      
        return np.asarray(features)
     
class EmptyFeaturizer(MyMolecularFeaturizer):
    def _featurize(self, mol: Mol) -> np.ndarray:
        return np.array([])

    
hiv_tasks, hiv_datasets, transformers = dc.molnet.load_hiv(featurizer=EmptyFeaturizer(use_original_atoms_order=True),
                                                           data_dir = 'data/hiv_data/data_dir', 
                                                           save_dir= 'data/hiv_data/save_dir')
train_dataset, valid_dataset, test_dataset = hiv_datasets

# Extract SMILES strings from the dataset
smiles_strings = train_dataset.ids
print(smiles_strings)

<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A904A0A0>
<rdkit.Chem.rdchem.Mol object at 0x000001D4A90

[13:48:38] Explicit valence for atom # 3 Al, 6, is greater than permitted


ArgumentError: Python argument types in
    rdkit.Chem.rdmolops.SanitizeMol(NoneType)
did not match C++ signature:
    SanitizeMol(class RDKit::ROMol {lvalue} mol, unsigned __int64 sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, bool catchErrors=False)

In [25]:
array1 = np.array([0])
array2 = np.array([1])
np.asarray([array1, np.array([]), array2])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.