In [1]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem

%matplotlib inline



In [2]:
# Root directory that contains the `data/` folder. It can be overridden with the
# TOXIC_MOLECULE_ROOT environment variable so the notebook also runs on machines
# other than the original author's; the default is the original location.
# CURRENT_PATH stays a plain string because later cells concatenate onto it.
CURRENT_PATH = os.environ.get('TOXIC_MOLECULE_ROOT', '/Users/skcc10170/Desktop')

# Raw splits: train / validation, plus the prediction input used as the test set.
df_train = pd.read_csv(os.path.join(CURRENT_PATH, 'data', 'org', 'train_.csv'))
df_valid = pd.read_csv(os.path.join(CURRENT_PATH, 'data', 'org', 'valid_.csv'))
df_test = pd.read_csv(os.path.join(CURRENT_PATH, 'data', 'org', 'predict_input.csv'))

In [3]:
# Tag every row (in place) with the name of the split it came from, so the
# combined frame can be separated again later.
for split_label, split_frame in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    split_frame['idx'] = split_label

# Stack the three splits (columns sorted alphabetically) and renumber rows from 0.
stacked = pd.concat([df_train, df_valid, df_test], sort=True)
df_tot = stacked.reset_index(drop=True)

In [4]:
# Number of rows in df_tot for each value of the 'idx' (split label) column.
df_tot['idx'].value_counts()

train    6680
valid    1669
test      927
Name: idx, dtype: int64

In [5]:
# Row counts of the three source frames followed by the combined frame.
tuple(len(frame) for frame in (df_train, df_valid, df_test, df_tot))

(6680, 1669, 927, 9276)

# 1. Build features on df_tot and save the result under /data/new

In [88]:
# MAX_LEN is the largest value in the 'num_atoms' column
# (the original note records this maximum as 88 atoms).
MAX_LEN = df_tot['num_atoms'].max()

In [89]:
def _parse_smiles(smiles):
    """Parse one SMILES string into an RDKit molecule.

    Raises:
        ValueError: if RDKit cannot parse the string (MolFromSmiles returned None),
            so the failure names the offending SMILES instead of surfacing later
            as an AttributeError on None.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError('RDKit could not parse SMILES: %r' % (smiles,))
    return mol


def _count_table(per_atom_values, categories):
    """Count, per molecule, how many atoms fall into each category.

    Args:
        per_atom_values: Series whose elements are lists holding one value per atom.
        categories: the values that become columns, in output order.

    Returns:
        DataFrame indexed like ``per_atom_values`` with exactly one float column
        per entry of ``categories``. A category that never occurs yields a column
        of zeros (no KeyError); values outside ``categories`` are not counted.
    """
    counts = pd.DataFrame(
        [Counter(values) for values in per_atom_values],
        index=per_atom_values.index,
    )
    # reindex guarantees every requested column exists; counts are kept as floats,
    # which is what the previous NaN-filling approach produced.
    return counts.reindex(columns=list(categories)).fillna(0).astype(float)


# Parse every SMILES exactly once and reuse the molecules for all features below.
mols = df_tot['SMILES'].apply(_parse_smiles)

# Element symbols of each molecule's atoms.
symbols_per_mol = mols.apply(lambda mol: [atom.GetSymbol() for atom in mol.GetAtoms()])

# All element symbols present in the data set. Sorted so that the order of the
# generated columns is the same on every run (plain set order is not).
LIST_SYMBOLS = sorted(set().union(*symbols_per_mol))

# num_atom_<symbol>: number of atoms of that element in the molecule.
symbol_counts = _count_table(symbols_per_mol, LIST_SYMBOLS)
for symbol in LIST_SYMBOLS:
    df_tot['num_atom_' + symbol] = symbol_counts[symbol]

# (column prefix, per-atom property, property values that get their own column).
# The value lists are the ones the notebook has always used; note that degree 5
# has no column.
per_atom_count_features = (
    ('num_degree_', lambda atom: atom.GetDegree(), (0, 1, 2, 3, 4, 6)),
    ('num_numH_', lambda atom: atom.GetTotalNumHs(), (0, 1, 2, 3)),
    ('IV_', lambda atom: atom.GetImplicitValence(), (0, 1, 2, 3)),
)
for prefix, atom_property, categories in per_atom_count_features:
    # Bind the current property as a default argument so the lambda does not
    # depend on the loop variable.
    values_per_mol = mols.apply(
        lambda mol, prop=atom_property: [prop(atom) for atom in mol.GetAtoms()]
    )
    value_counts = _count_table(values_per_mol, categories)
    for category in categories:
        df_tot[prefix + str(category)] = value_counts[category]

# atoms_isAromatic: number of aromatic atoms in the molecule.
df_tot['atoms_isAromatic'] = mols.apply(
    lambda mol: sum(atom.GetIsAromatic() for atom in mol.GetAtoms())
)

In [90]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/Users/skcc10170/Desktop/GIT/Toxic_Molecule/bss/code/eda'

In [93]:
# Separate the feature-enriched frame back into its three splits using the
# split label stored in the 'idx' column.
split_of_row = df_tot['idx']
df_train_new = df_tot.loc[split_of_row == 'train']
df_valid_new = df_tot.loc[split_of_row == 'valid']
df_test_new = df_tot.loc[split_of_row == 'test']

In [96]:
# Each rebuilt split must have as many rows as the frame it was loaded from;
# one True/False line is printed per split (train, valid, test).
for rebuilt, loaded in ((df_train_new, df_train), (df_valid_new, df_valid), (df_test_new, df_test)):
    print(len(rebuilt) == len(loaded))

True
True
True


In [97]:
# Write each feature-enriched split to its CSV file below CURRENT_PATH
# (default to_csv settings, so the row index is written as the first column).
outputs = (
    (df_train_new, '/data/new/train_.csv'),
    (df_valid_new, '/data/new/valid_.csv'),
    (df_test_new, '/data/new/test_.csv'),
)
for frame, relative_path in outputs:
    frame.to_csv(CURRENT_PATH + relative_path)

# Generating new features

In [102]:
from rdkit import Chem
from rdkit.Chem import Descriptors

# Parse the SMILES of rows 0..999 one after another. Nothing is collected, so
# after the loop `temp` holds the molecule parsed from row 999; the cells below
# inspect that molecule.
for row_label in range(1000):
    smiles_text = df_tot.at[row_label, 'SMILES']
    temp = Chem.MolFromSmiles(smiles_text)
#     print(temp.GetNumConformers())

In [130]:
# rdkit.Chem.Descriptors3D.Asphericity(temp)
# Element symbols of the aromatic atoms of `temp`, in the order RDKit yields them.
[aromatic_atom.GetSymbol() for aromatic_atom in temp.GetAromaticAtoms()]

['N',
 'C',
 'C',
 'C',
 'C',
 'C',
 'N',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'N',
 'C',
 'C',
 'N',
 'C',
 'C',
 'N']

In [88]:
def _bond_end_tags(mol, bond_idx):
    """Return the '<symbol><atom index>' tags of a bond's begin and end atom."""
    bond = mol.GetBondWithIdx(bond_idx)
    return (
        str(bond.GetBeginAtom().GetSymbol()) + str(bond.GetBeginAtomIdx()),
        str(bond.GetEndAtom().GetSymbol()) + str(bond.GetEndAtomIdx()),
    )


# For every 3-bond subgraph of `temp`, print the three bond indices and then,
# on the next line, the begin/end atom tags of each of those bonds.
for bond_path in Chem.rdmolops.FindAllSubgraphsOfLengthN(temp, 3):
    path_bonds = (bond_path[0], bond_path[1], bond_path[2])
    print(*path_bonds)
    print(*(tag for bond_idx in path_bonds for tag in _bond_end_tags(temp, bond_idx)))

0 1 34
C0 C1 C1 N2 N24 N2
0 1 2
C0 C1 C1 N2 N2 C3
1 34 23
C1 N2 N24 N2 C23 N24
1 34 2
C1 N2 N24 N2 N2 C3
1 2 3
C1 N2 N2 C3 C3 C4
2 3 35
N2 C3 C3 C4 C23 C4
2 3 4
N2 C3 C3 C4 C4 C5
2 3 34
N2 C3 C3 C4 N24 N2
2 34 23
N2 C3 N24 N2 C23 N24
3 35 23
C3 C4 C23 C4 C23 N24
3 35 22
C3 C4 C23 C4 C22 C23
3 35 4
C3 C4 C23 C4 C4 C5
3 4 19
C3 C4 C4 C5 C5 C20
3 4 5
C3 C4 C4 C5 C5 O6
4 19 20
C4 C5 C5 C20 C20 C21
4 19 5
C4 C5 C5 C20 C5 O6
4 19 35
C4 C5 C5 C20 C23 C4
4 5 6
C4 C5 C5 O6 O6 C7
4 5 35
C4 C5 C5 O6 C23 C4
4 35 23
C4 C5 C23 C4 C23 N24
4 35 22
C4 C5 C23 C4 C22 C23
5 6 36
C5 O6 O6 C7 C19 C7
5 6 7
C5 O6 O6 C7 C7 C8
5 6 19
C5 O6 O6 C7 C5 C20
5 19 20
C5 O6 C5 C20 C20 C21
6 36 18
O6 C7 C19 C7 C17 C19
6 36 7
O6 C7 C19 C7 C7 C8
6 7 8
O6 C7 C7 C8 C8 N9
7 8 9
C7 C8 C8 N9 N9 C10
7 8 36
C7 C8 C8 N9 C19 C7
7 36 18
C7 C8 C19 C7 C17 C19
8 9 16
C8 N9 N9 C10 C10 C17
8 9 10
C8 N9 N9 C10 C10 C11
9 16 18
N9 C10 C10 C17 C17 C19
9 16 17
N9 C10 C10 C17 C17 F18
9 16 10
N9 C10 C10 C17 C10 C11
9 10 12
N9 C10 C10 C11 C11 N