In [1]:
import ast
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import BondType
import torch
import pandas as pd
import pickle

# 필요한 함수 정의

1. 동적 matrix(adjacency, feature) 생성 함수

In [2]:
def smiles_to_graph(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    
    num_atoms = molecule.GetNumAtoms()
    
    adjacency = np.zeros((num_atoms, num_atoms), 'int64')
    features = np.zeros((num_atoms), 'int64')
    
    for atom in molecule.GetAtoms():
        i = atom.GetIdx()
        atom_type = atom_mapping[atom.GetSymbol()]
        features[i] = atom_type
        
        for neighbor in atom.GetNeighbors():
            j = neighbor.GetIdx()
            bond = molecule.GetBondBetweenAtoms(i, j)
            bond_type_idx = bond_mapping[bond.GetBondType().name]
            adjacency[[i, j], [j, i]] = bond_type_idx
            
    adjacency = torch.tensor(adjacency.tolist(), dtype=torch.int8)
    features = torch.tensor(features.tolist(), dtype=torch.int8)
    
    return adjacency, features

2. adjacency, features 정보로 분자 복구하는 함수

In [3]:
def graph_to_molecule(features, adjacency):
    molecule = Chem.RWMol()  # 편집 가능한 빈 molecule 추가
    num_atoms = len(features)
    atom_index_map = {}
    
    # 'features' 이용해서 molecule에 atom 추가
    for i in range(num_atoms):
        atom_type = features[i].item() # Convert tensor to int
        atom_symbol = atom_mapping[atom_type]
        new_atom_index = molecule.AddAtom(Chem.Atom(atom_symbol))
        atom_index_map[i] = new_atom_index

    # 'adjacency' 이용해서 molecule에 bond 추가
    for i in range(num_atoms):
        for j in range(i + 1, num_atoms):
            bond_type_idx = adjacency[i, j].item()  # Convert tensor to int

            if bond_type_idx == 0:
                continue

            bond_type = bond_mapping[bond_type_idx]

            new_i = atom_index_map[i]
            new_j = atom_index_map[j]

            if bond_type != 0:
                molecule.AddBond(new_i, new_j, bond_type)

    return molecule

3. df 받아서 list of dict 생성<br>

!!! 추출하고자 하는 속성명 df 내 컬럼명과 동일하게 바꾸기 !!!

In [80]:
def generate_pickle(df):
    mol_list = []
    for index, row in df.iterrows():
        smiles = row['Can_SMILES']
        mu = row['total_dipole_moment']     ### 꼭 맞춰서 바꾸기!!!!!!###

        adjacency, features = smiles_to_graph(smiles)

        molecule_dict = {
            'num_atom' : adjacency.shape[0],
            'atom_type' : torch.tensor(features, dtype=torch.int8),
            'bond_type' : torch.tensor(adjacency, dtype=torch.int8),
            'mu' : torch.tensor([mu], dtype=torch.float32) ### 꼭 맞춰서 바꾸기!!!!!!###
        }

        mol_list.append(molecule_dict)
        
    return mol_list

# Data load

### From PubChemQC

In [45]:
data = pd.read_csv('../1_PrepareDataset/PubChemQC_22348.csv')
data.columns

Index(['Unnamed: 0', 'i', 'molecular_formula', 'molecular_weight',
       'atom_count', 'heavy_atom_count', 'total_dipole_moment', 'homo', 'lumo',
       'gap', 'Isomeric_SMILES', 'Can_SMILES', 'delocal_charged', 'protic_N',
       'alcohol', 'thiol', 'PH', 'small_ring'],
      dtype='object')

In [46]:
df = data[['Can_SMILES', 'homo', 'lumo', 'total_dipole_moment']]
df.columns

Index(['Can_SMILES', 'homo', 'lumo', 'total_dipole_moment'], dtype='object')

In [47]:
df.head(3)

Unnamed: 0,Can_SMILES,homo,lumo,total_dipole_moment
0,C=COCC(C)CCCC(C)CC(C)C,-5.796025,1.02859,1.514198
1,CC(Cl)(C=O)CCl,-7.621909,-1.542886,1.474742
2,C=COCCOCCOCC,-5.54568,0.993216,2.92189


# bond, atom Dictionary  생성

In [48]:
# bond_mapping Dictionary 생성
bond_mapping = {"SINGLE": 1, "DOUBLE": 2, "TRIPLE": 3, "AROMATIC": 4}
bond_mapping.update({1:BondType.SINGLE, 2: BondType.DOUBLE, 3: BondType.TRIPLE, 4: BondType.AROMATIC})
print('bond_mapping: ', bond_mapping)

# atom_type(SMILES_CHARSET) 생성
atom_type = list()

for smi in df['Can_SMILES']:
    mol = Chem.MolFromSmiles(smi)
    for at in mol.GetAtoms():
        if at.GetSymbol() not in atom_type:
            atom_type.append(at.GetSymbol())
            
SMILE_CHARSET = atom_type.copy()
SMILE_CHARSET.sort()
print('SMILE_CHARSET: ', SMILE_CHARSET)

bond_mapping:  {'SINGLE': 1, 'DOUBLE': 2, 'TRIPLE': 3, 'AROMATIC': 4, 1: rdkit.Chem.rdchem.BondType.SINGLE, 2: rdkit.Chem.rdchem.BondType.DOUBLE, 3: rdkit.Chem.rdchem.BondType.TRIPLE, 4: rdkit.Chem.rdchem.BondType.AROMATIC}




SMILE_CHARSET:  ['C', 'Cl', 'F', 'H', 'N', 'O', 'P', 'S']


In [49]:
# atom_mapping_Dictionary 생성
SMILE_to_index = dict((c, i) for i, c in enumerate(SMILE_CHARSET))
index_to_SMILE = dict((i, c) for i, c in enumerate(SMILE_CHARSET))
atom_mapping = dict(SMILE_to_index)
atom_mapping.update(index_to_SMILE)
print('atom_mapping: ', atom_mapping)

atom_mapping:  {'C': 0, 'Cl': 1, 'F': 2, 'H': 3, 'N': 4, 'O': 5, 'P': 6, 'S': 7, 0: 'C', 1: 'Cl', 2: 'F', 3: 'H', 4: 'N', 5: 'O', 6: 'P', 7: 'S'}


# Train, Val, Test 분리

In [105]:
from sklearn.model_selection import train_test_split

shuffled_indices = df.sample(frac=1, random_state=100).index
shuffled_df = df.sample(frac=1, random_state=100)

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# train, val, test indices 분리
train_indices, val_test_indices = train_test_split(shuffled_indices, 
                                                   train_size=train_ratio, 
                                                   test_size=val_ratio+test_ratio, 
                                                   random_state=100)
val_indices, test_indices = train_test_split(val_test_indices, 
                                             train_size=val_ratio/(val_ratio+test_ratio), 
                                             test_size=test_ratio/(val_ratio+test_ratio), 
                                             random_state=100)

train_index = [train_indices]
val_index = [val_indices]
test_index = [test_indices]

# train, val, test data 분리
train_data = df.loc[train_indices]
val_data = df.loc[val_indices]
test_data = df.loc[test_indices]

print("Train set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))

Train set size: 17878
Validation set size: 2235
Test set size: 2235


In [106]:
print("Train indices length:", len(train_index))
print("Validation indices length:", len(val_index))
print("Test indices length:", len(test_index))
print('-'*50)
print("train_index[0] length:", len(train_index[0]))
print("val_index[0] length:", len(val_index[0]))
print("test_index[0] length:", len(test_index[0]))

Train indices length: 1
Validation indices length: 1
Test indices length: 1
--------------------------------------------------
train_index[0] length: 17878
val_index[0] length: 2235
test_index[0] length: 2235


# 각 데이터셋으로 Dict 생성

In [107]:
print('Generating train_list_of_dict...')
train_list_of_dict = generate_pickle(train_data)
print('train_list_of_dict generated')

print('Generating val_list_of_dict...')
val_list_of_dict = generate_pickle(val_data)
print('val_list_of_dict generated')

print('Generating test_list_of_dict...')
test_list_of_dict = generate_pickle(test_data)
print('test_list_of_dict generated')

Generating train_list_of_dict...


  'atom_type' : torch.tensor(features, dtype=torch.int8),
  'bond_type' : torch.tensor(adjacency, dtype=torch.int8),


train_list_of_dict generated
Generating val_list_of_dict...
val_list_of_dict generated
Generating test_list_of_dict...
test_list_of_dict generated




# pickle 저장

!!! 경로, 저장파일명 수정 !!!

In [108]:
# Define the file paths for saving the data and indices
file_path = "../3_PretrainDescriptor/training/pickle_data/3_dipole/"
train_data_path = file_path + "train.pickle"
val_data_path = file_path + "val.pickle"
test_data_path = file_path + "test.pickle"

# Save train data
with open(train_data_path, 'wb') as f:
    pickle.dump(train_list_of_dict, f)

# Save validation data
with open(val_data_path, 'wb') as f:
    pickle.dump(val_list_of_dict, f)

# Save test data
with open(test_data_path, 'wb') as f:
    pickle.dump(test_list_of_dict, f)

In [109]:
train_data.head(3)

Unnamed: 0,Can_SMILES,homo,lumo,total_dipole_moment
8427,O=c1n(Cl)c(=O)n(Cl)c(=O)n1Cl,-8.177021,-2.86808,7.4e-05
21953,CC=Nc1nc[nH]c1C#N,-6.408281,-1.545607,4.271928
20358,CC(=O)CCCn1nnc2c(cnn2C)c1=O,-6.710328,-1.69799,1.894273


In [110]:
train_list_of_dict[:3]

[{'num_atom': 12,
  'atom_type': tensor([5, 0, 4, 1, 0, 5, 4, 1, 0, 5, 4, 1], dtype=torch.int8),
  'bond_type': tensor([[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [2, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4, 0],
          [0, 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 4, 0, 0, 2, 4, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 4, 0, 0, 1, 4, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 4, 0, 0, 2, 4, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
          [0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=torch.int8),
  'mu': tensor([7.3580e-05])},
 {'num_atom': 10,
  'atom_type': tensor([0, 0, 4, 0, 4, 0, 4, 0, 0, 4], dtype=torch.int8),
  'bond_type': tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [1, 0, 2, 0, 0, 0, 0, 0, 0, 0],
          [0, 2, 0, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 0, 4, 0, 0, 4