In [7]:
import torch_geometric
from torch_geometric import data
from torch_geometric.utils import from_smiles
import pandas as pd
import torch
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem


In [13]:
# Load the train and test tables from CSV. Later cells read the 'Smiles'
# and 'pIC50' columns from each frame.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/hERg_train.csv', 'data/hERg_test.csv')
)

In [14]:
# Fixed seed for RDKit's stochastic conformer embedding, so the generated
# coordinates (and the list of molecules that fail to embed) are the same on
# every Restart & Run All.
EMBED_SEED = 42


def _smiles_to_graph(smiles, target, seed=EMBED_SEED):
    """Build one graph with 3D atom coordinates from a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        target: Regression label (the 'pIC50' value) stored on the graph as `y`.
        seed: Random seed passed to RDKit's conformer embedding.

    Returns:
        The graph object produced by `from_smiles`, with `y` (shape [1]) and
        `pos` (one xyz row per atom of the hydrogen-stripped molecule) set,
        or None if the SMILES cannot be parsed or no conformer can be embedded.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse; without this guard
        # Chem.AddHs(None) would raise and abort the whole loop.
        return None

    # torch_geometric's from_smiles converts the SMILES into a graph.
    graph = from_smiles(smiles, with_hydrogen=False)
    # Pin the label dtype so it does not depend on how pandas hands back the scalar.
    graph.y = torch.tensor(target, dtype=torch.float).unsqueeze(-1)

    # The steps below add per-atom coordinates. They are only needed if the
    # model uses positions; the graph above is otherwise complete.
    # Hydrogens are added because embedding fails for many structures without them.
    mol = Chem.AddHs(mol)
    if AllChem.EmbedMolecule(mol, randomSeed=seed) == -1:
        return None
    # Return code intentionally ignored (as before): the embedded coordinates
    # are kept whether or not the force-field optimization converges.
    AllChem.UFFOptimizeMolecule(mol)

    # Strip the hydrogens again before reading the coordinates.
    mol = Chem.RemoveHs(mol)
    coordinates = mol.GetConformer().GetPositions()
    # Attach the coordinates to the graph as `pos`.
    graph.pos = torch.tensor(np.asarray(coordinates), dtype=torch.float)
    return graph


def _build_graphs(df, seed=EMBED_SEED):
    """Convert every row of `df` into a graph.

    Args:
        df: DataFrame with 'Smiles' and 'pIC50' columns.
        seed: Random seed forwarded to the conformer embedding.

    Returns:
        (graphs, skipped): the list of graphs built successfully, and the list
        of SMILES strings that were skipped, both in row order.
    """
    graphs, skipped = [], []
    for smiles, target in zip(df['Smiles'], df['pIC50']):
        graph = _smiles_to_graph(smiles, target, seed=seed)
        if graph is None:
            skipped.append(smiles)
        else:
            graphs.append(graph)
    return graphs, skipped


train, complex_smiles_train = _build_graphs(train_df)
test, complex_smiles_test = _build_graphs(test_df)

# A later cell displays `data` as a sanity check, so keep that name bound to
# the last processed test graph. NOTE(review): this shadows the `data` module
# imported from torch_geometric at the top of the notebook.
if test:
    data = test[-1]

In [15]:
# Sanity check: display the graph object left bound to `data` by the
# preprocessing cell, to confirm its fields (x, edge_index, edge_attr, y, pos).
data

Data(x=[50, 9], edge_index=[2, 108], edge_attr=[108, 3], smiles='COc1ncc(-c2ccc(C(=O)O)cc2C)cc1-c1ccc(C(F)(F)F)cc1CN1C(=O)O[C@H](c2cc(C(F)(F)F)cc(C(F)(F)F)c2)[C@@H]1C', y=[1], pos=[50, 3])

In [27]:
# Persist the processed graph lists to disk with torch.save, train first
# and then test.
for graph_list, out_path in (
    (train, 'data/train_graph'),
    (test, 'data/test_graph_add'),
):
    torch.save(graph_list, out_path)

In [16]:
# Training-set SMILES that were skipped during graph construction
# (they are not present in `train`).
complex_smiles_train

['O=C(O[C@@H]1C[C@@H]2C[C@H]3C[C@H](C1)N2CC3=O)c1c[nH]c2ccccc12',
 'CO[C@@]12CC[C@@]3(C[C@@H]1[C@](C)(O)C(C)(C)C)[C@H]1Cc4ccc(O)c5c4[C@@]3(CCN1CC1CC1)[C@H]2O5']

In [17]:

# Test-set SMILES that were skipped during graph construction
# (they are not present in `test`).
complex_smiles_test

[]