In [1]:
import torch
import torch_geometric as pyg
from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.datasets import QM9
from torch_geometric.nn import GCNConv, NNConv
from torch_geometric.nn.conv import GATv2Conv
from torch_geometric.nn.models import MLP
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import NormalizeFeatures
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
import rdkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from rdkit.Chem.rdchem import BondType, HybridizationType
import os

def ECFPGen(smiles, radius=4, nBits=2048):
    mol = Chem.MolFromSmiles(smiles)
    morgan = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    array = np.zeros(nBits)
    rdkit.DataStructs.ConvertToNumpyArray(morgan, array)
    return np.nonzero(array)
morgan = ECFPGen("C")


In [9]:
# Create Dataset
# reference source code 
# https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/qm9.html
# neural fingerprint on pytorch ref
# https://qiita.com/kimisyo/items/55a01e27aa03852d84e9


#RDLogger.DisableLog("rdApp.*")

# units conversion
HAR2EV = 27.211386246
KCALMOL2EV = 0.04336414
conversion = torch.tensor([
1., 1., HAR2EV, HAR2EV, HAR2EV, 1., HAR2EV, HAR2EV, HAR2EV, HAR2EV, HAR2EV,
1., KCALMOL2EV, KCALMOL2EV, KCALMOL2EV, KCALMOL2EV, 1., 1., 1.
])

atomrefs = {
    6: [0., 0., 0., 0., 0.],
    7: [
        -13.61312172, -1029.86312267, -1485.30251237, -2042.61123593,
        -2713.48485589
    ],
    8: [
        -13.5745904, -1029.82456413, -1485.26398105, -2042.5727046,
        -2713.44632457
    ],
    9: [
        -13.54887564, -1029.79887659, -1485.2382935, -2042.54701705,
        -2713.42063702
    ],
    10: [
        -13.90303183, -1030.25891228, -1485.71166277, -2043.01812778,
        -2713.88796536
    ],
    11: [0., 0., 0., 0., 0.],
}

class MyFirstDataset(pyg.data.Dataset):
    def __init__(self, root="./MyFirstDataset", transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def processed_file_names(self):
        return "qm9.pt"


    def process(self):
        types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
        bonds = {BondType.SINGLE: 0, BondType.DOUBLE: 1, BondType.TRIPLE: 2, BondType.AROMATIC: 3}
        
        # グラフの特徴量
        self.data = pd.read_csv("qm9_dataset.csv")
            # 列の並べ替え(reindex)
        self.data = self.data.reindex(index=["mol_id", "smiles", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom", "A", "B", "C"])
        
        target = []
        for name, i in df.iterrows():
            tmp = [float(x) for x in i.values[2:]]
        target.append(tmp)
        target = torch.tensor(target, dtype=torch.float)
        target = torch.cat([target[:, 3:], target[:, 3:]], dim=0) # ?
        target = target * conversion.view(1, -1)

        i = 0
        for index, mol in self.data.iterrows():
            mol_obj = Chem.MolFromSmiles(mol["smiles"])
            N = mol_obj.GetNumAtoms()
            conf = mol.GetConformer()
            pos = conf.GetPositions()
            pos = torch.tensor(pos, dtype=torch.float)

            type_idx = atomic_number = aromatic = sp = sp2 = sp3 = num_hs = [], [], [], [], [], [], []
            for atom in mol_obj.GetAtoms():
                type_idx.append(types[atom.GetSymbol()])
                atomic_number.append(atom.GetAtomicNum())
                if atom.GetIsAromatic():
                    aromatic.append(1)
                else:
                    aromatic.append(0)
                hybridization = atom.GetHybridization()
                sp.append(1 if hybridization == HybridizationType.SP else 0)
                sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
                sp3.append(1 if hybridization == HybridizationType.SP3 else 0)
            z = torch.tensor(atomic_number, dtype=torch.long)
            
            # edge index
            row, col, edge_type = [], [], []
            for bond in mol_obj.GetBonds():
                start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
                row += [start, end]
                col += [end, start]
                edge_type += 2 * [bonds[bond.GetBondType()]]
            
            edge_index = torch.tensor([row, col], dtype=torch.long)
            edge_type = torch.tensor(edge_type, dtype=torch.long)
            edge_attr = one_hot(edge_type, num_classes=len(bonds))

            perm = (edge_index[0] * N + edge_index[1]).argsort()
            edge_index = edge_index[:, perm]
            edge_type = edge_type[perm]
            edge_attr = edge_attr[perm]

            row, col = edge_index
            hs = (z == 1).to(torch.float)
            num_hs = scatter(hs[row], col, dim_size=N, reduce='sum').tolist()
            # types: qm9に含まれる原子の一覧(H,C,N,O,F)。num_classesは原子の種類の数
            # x1:原子記号のリスト
            x1 = one_hot(torch.tensor(type_idx), num_classes=len(types))
            
            # node features: 原子番号、芳香性、混成の有無(sp,sp2,sp3)、水素の数
            x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                              dtype=torch.float).t().contiguous()
            # x1,x2を結合
            x = torch.cat([x1, x2], dim=-1)
            #グラフ特徴量
            y = target[i].unsqueeze(0)
            i += 1
            name = mol_obj.GetProp("_Name")

            data = Data(x=x, )

            # node features
            node_features = self._get_node_features(mol_obj)
            # edge features
            edge_features = self._get_edge_features(mol_obj)
            # edge inde
            edge_index = self._get_adjacency_info(mol_obj)
            # label
            label = self.__get__labels(mol["alpha"])
            # create dataset
            data = pyg.Data(x=node_features,
                            edge_index=edge_index,
                            edge_attr=edge_features,
                            y=label,
                            smiles=mol["smiles"])
            torch.save(Data, os.path.join(self.processed_dir, f"data_{index}.pt"))
    
    def _get_node_features(self, mol):
        for atom in mol.GetAtoms():
            node_features = []
            node_features.append(atom.GetAtomicNum())
            node_features.append(atom.GetDegree())
            node_features.append(GetFormalCharge())


    


    


In [10]:
MyFirstDataset.ECFPGen("CC")

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [4]:
qm9_df = pd.read_csv("qm9_dataset.csv")
qm9_df["ECFP"] = [ECFPGen(smiles, radius=4, nBits=2048) for smiles in qm9_df["smiles"]]
qm9_df.head() #r=4,n=2048 1m25.7sで全部生成できた

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom,ECFP
0,gdb_1,C,157.7118,157.70997,157.70699,0.0,13.21,-0.3877,0.1171,0.5048,...,-40.47893,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.64329,-401.014647,-372.471772,"([1264],)"
1,gdb_2,N,293.60975,293.54111,191.39397,1.6256,9.46,-0.257,0.0829,0.3399,...,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802,"([930],)"
2,gdb_3,O,799.58812,437.90386,282.94545,1.8511,6.31,-0.2928,0.0687,0.3615,...,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171,"([790],)"
3,gdb_4,C#C,0.0,35.610036,35.610036,0.0,16.28,-0.2845,0.0506,0.3351,...,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724,"([915, 1119],)"
4,gdb_5,C#N,0.0,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,-93.411888,-93.40937,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028,"([489, 915, 1384],)"


In [8]:
df = pd.read_csv("./AR_ALL.csv", low_memory=False)
df.keys

<bound method NDFrame.keys of       CMPD_CHEMBLID  MOLREGNO PARENT_CMPD_CHEMBLID  PARENT_MOLREGNO  \
0         CHEMBL139     29890            CHEMBL139            29890   
1         CHEMBL139     29890            CHEMBL139            29890   
2         CHEMBL141     33986            CHEMBL141            33986   
3         CHEMBL141     33986            CHEMBL141            33986   
4         CHEMBL640     27341            CHEMBL640            27341   
...             ...       ...                  ...              ...   
51190  CHEMBL329342    141865         CHEMBL329342           141865   
51191  CHEMBL331372    192029         CHEMBL331372           192029   
51192  CHEMBL331372    192029         CHEMBL331372           192029   
51193  CHEMBL331382    192098         CHEMBL331382           192098   
51194  CHEMBL331382    192098         CHEMBL331382           192098   

      MOL_PREF_NAME  COMPOUND_KEY  MOLWEIGHT  ALOGP     PSA  \
0        DICLOFENAC    DICLOFENAC     296.15   4.36   

In [None]:
for i in qm9_df["smiles"]:
    print(i)