In [3]:
import os
from glob import glob

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdchem import RWMol
from rdkit import Chem, RDLogger
from rdkit.Chem.rdchem import BondType as BT
from rdkit.Chem.rdchem import HybridizationType

import torch
import torch.nn.functional as F

from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch_scatter import scatter

In [10]:
smile_csv = pd.read_csv('../data/train_set.ReorgE.csv', index_col=0)

In [11]:
smile_csv

Unnamed: 0_level_0,SMILES,Reorg_g,Reorg_ex
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
train_0,CC[C@H]1CCCCN1C(=O)[C@@H](C)OC(=O)c1c(C)oc(-n2...,0.631486,0.535060
train_1,O[C@@H](CNC1CC1)CN1CCc2sccc2C1,0.825901,1.116781
train_2,N#CCCNC(=O)[C@@]1(O)CCSC1,1.463943,0.964848
train_3,COC[C@H]1CN(c2ccc(OCC[C@@H](C)O)cc2)C(=O)O1,0.166669,0.161458
train_4,N#Cc1c(-c2ccccc2OCC(N)=O)[nH]c(C(N)=O)c1N,0.313820,0.338862
...,...,...,...
train_18152,CC(=O)Nc1ccc2ccc3cccc4ccc1c2c34,0.146917,0.143084
train_18153,CC(C)(C)c1ccccc1N(c1ccccc1)c1ccc(S(=O)(=O)c2cc...,0.612898,0.500668
train_18154,CN(C)c1ccc(C(=O)Nc2ccccc2)cc1,1.218777,1.048954
train_18155,c1ccc(N(c2ccccc2)c2ccc(-c3ncc(-c4ccc(-c5cnc(-c...,0.145292,0.182589


In [34]:
smile_csv.iloc[0].Reorg_g

0.631486

In [2]:
train_data_dirs = '../data/mol_files/train_set'
test_data_dirs = '../data/mol_files/test_set'

In [4]:
train_data = glob(train_data_dirs+'/*.mol')

In [7]:
test_data = glob(test_data_dirs+'/*.mol')

In [16]:
atomic_number = []

for m_dir in train_data:
    m = Chem.MolFromMolFile(m_dir)
    
    for atom in m.GetAtoms():
        atomic_number.append(atom.GetAtomicNum())

In [17]:
set(atomic_number)

{5, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53}

In [20]:
bond_type = []

for m_dir in train_data:
    m = Chem.MolFromMolFile(m_dir)
    
    for bond in m.GetBonds():
        b = bond.GetBondType()
        
        bond_type.append(b)

In [21]:
set(bond_type)

{rdkit.Chem.rdchem.BondType.SINGLE,
 rdkit.Chem.rdchem.BondType.DOUBLE,
 rdkit.Chem.rdchem.BondType.TRIPLE,
 rdkit.Chem.rdchem.BondType.AROMATIC}

In [28]:
types = {'H': 0, 'B': 1, 'C': 2, 'N': 3, 'O': 4, 'F': 5, 'Si': 6, 'P': 7, 'S': 8, 'Cl': 9, 'Br': 10, 'I': 11}
bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

In [47]:
dataset = []

for m_dir in train_data:
    temp = m_dir.split('/')[-1].split('_')
    index = int(temp[1])
    if 'g' in temp[-1]:
        target = smile_csv.iloc[index].Reorg_g
    else:
        target = smile_csv.iloc[index].Reorg_ex
        
    m = Chem.MolFromMolFile(m_dir)
    
    N = m.GetNumAtoms()
    name = m.GetProp('_Name')
    
    conf = m.GetConformer()
    pos = conf.GetPositions()
    pos = torch.tensor(pos, dtype=torch.float)
    
    type_idx = []
    atomic_number = []
    aromatic = []
    sp = []
    sp2 = []
    sp3 = []
    num_hs = []
    for atom in m.GetAtoms():
        type_idx.append(types[atom.GetSymbol()])
        atomic_number.append(atom.GetAtomicNum())
        aromatic.append(1 if atom.GetIsAromatic() else 0)
        hybridization = atom.GetHybridization()
        sp.append(1 if hybridization == HybridizationType.SP else 0)
        sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
        sp3.append(1 if hybridization == HybridizationType.SP3 else 0)
    
    z = torch.tensor(atomic_number, dtype=torch.long)
    
    
    row, col, edge_type = [], [], []

    for bond in m.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        row += [start, end]
        col += [end, start]
        edge_type += 2 * [bonds[bond.GetBondType()]]
        
    edge_index = torch.tensor([row, col], dtype=torch.long)
    edge_type = torch.tensor(edge_type, dtype=torch.long)
    edge_attr = F.one_hot(edge_type,
                          num_classes=len(bonds)).to(torch.float)
    
    perm = (edge_index[0] * N + edge_index[1]).argsort()
    edge_index = edge_index[:, perm]
    edge_type = edge_type[perm]
    edge_attr = edge_attr[perm]
    
    row, col = edge_index
    hs = (z == 1).to(torch.float)
    num_hs = scatter(hs[row], col, dim_size=N).tolist()
    
    x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(types))
    x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                      dtype=torch.float).t().contiguous()
    x = torch.cat([x1.to(torch.float), x2], dim=-1)
    
    data = Data(x=x, z=z, pos=pos, edge_index=edge_index,
                        edge_attr=edge_attr, y=target, idx=index)
    
    dataset.append(data)

In [48]:
loader = DataLoader(dataset, batch_size=1)

In [53]:
from torch_geometric.nn import DimeNet

In [54]:
model = DimeNet(hidden_channels=128, out_channels=1, num_blocks=3, num_bilinear=3, num_spherical=3, num_radial=3)

RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.