In [98]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem

%matplotlib inline

In [99]:
# Base directory under which the data/ folder lives.
CURRENT_PATH = '/Users/skcc10170/Desktop'

# Read the three raw CSV files in one pass: train, validation, prediction input.
df_train, df_valid, df_test = (
    pd.read_csv(CURRENT_PATH + relative_path)
    for relative_path in (
        '/data/org/train_.csv',
        '/data/org/valid_.csv',
        '/data/org/predict_input.csv',
    )
)

In [100]:
# Label every row with the split it belongs to, so the combined frame can be
# separated back into train / valid / test later on.
for split_frame, split_label in ((df_train, 'train'), (df_valid, 'valid'), (df_test, 'test')):
    split_frame['idx'] = split_label

# Stack the three splits into one frame and renumber the rows from zero.
# sort=True asks pandas to sort the column axis when the frames' columns differ.
df_tot = pd.concat(
    [df_train, df_valid, df_test],
    sort=True,
).reset_index(drop=True)

In [101]:
# Number of rows carrying each split label ('train' / 'valid' / 'test') in the combined frame.
df_tot['idx'].value_counts()

train    6680
valid    1669
test      927
Name: idx, dtype: int64

In [102]:
# Row counts of the three splits, followed by the row count of the combined frame.
tuple(len(frame) for frame in (df_train, df_valid, df_test, df_tot))

(6680, 1669, 927, 9276)

# 1. df_tot로 변환해서 /data/new 에 저장

In [88]:
# Added feature: the atom count RDKit reports for each molecule's SMILES string.
smiles_column = df_tot['SMILES']
df_tot['num_atoms'] = smiles_column.map(lambda smi: Chem.MolFromSmiles(smi).GetNumAtoms())

# Largest atom count over all rows; the original author's note records this maximum as 88 atoms.
MAX_LEN = df_tot['num_atoms'].max()

In [89]:
# Per-atom property values that each get their own count column.
# NOTE(review): degree 5 is absent from the original list — presumably it never
# occurs in this data set; confirm before relying on it.
DEGREE_VALUES = [0, 1, 2, 3, 4, 6]
NUM_H_VALUES = [0, 1, 2, 3]
IMPLICIT_VALENCE_VALUES = [0, 1, 2, 3]

# Parse every SMILES exactly once and reuse the molecule for all features below
# (previously the whole column was re-parsed for each feature).
mols = [Chem.MolFromSmiles(smiles) for smiles in df_tot['SMILES']]

# MolFromSmiles returns None for input it cannot parse; stop here with a readable
# message instead of an AttributeError raised from inside a lambda.
unparsed = [smiles for smiles, mol in zip(df_tot['SMILES'], mols) if mol is None]
if unparsed:
    raise ValueError('RDKit could not parse {} SMILES, e.g. {}'.format(len(unparsed), unparsed[:5]))


def _add_count_columns(per_mol_values, keys, prefix):
    """Add one occurrence-count column per key to df_tot.

    per_mol_values: one list of per-atom values for each row of df_tot, in row order.
    keys: the values to count; each one becomes the column ``prefix + str(key)``.
    prefix: column-name prefix.

    Values that are not listed in ``keys`` are ignored. A key that never occurs
    yields a column of zeros instead of a KeyError.
    """
    counts = pd.DataFrame(
        [Counter(values) for values in per_mol_values],
        index=df_tot.index,
        columns=keys,
    ).fillna(0)  # fillna replaces np.NaN-based replace(); np.NaN no longer exists in NumPy 2
    for key in keys:
        df_tot[prefix + str(key)] = counts[key]


# Element symbols: one count column per symbol seen anywhere in the data.
atom_symbols = [[atom.GetSymbol() for atom in mol.GetAtoms()] for mol in mols]
# sorted() pins the column order (set iteration order of strings is not stable
# between interpreter runs); set().union(...) also tolerates an empty frame.
LIST_SYMBOLS = sorted(set().union(*atom_symbols))
_add_count_columns(atom_symbols, LIST_SYMBOLS, 'num_atom_')

# Atom degree counts.
_add_count_columns(
    [[atom.GetDegree() for atom in mol.GetAtoms()] for mol in mols],
    DEGREE_VALUES,
    'num_degree_',
)

# Counts of atoms by total number of attached hydrogens.
_add_count_columns(
    [[atom.GetTotalNumHs() for atom in mol.GetAtoms()] for mol in mols],
    NUM_H_VALUES,
    'num_numH_',
)

# Counts of atoms by implicit valence.
_add_count_columns(
    [[atom.GetImplicitValence() for atom in mol.GetAtoms()] for mol in mols],
    IMPLICIT_VALENCE_VALUES,
    'IV_',
)

# Number of aromatic atoms per molecule (True counts as 1 in the sum).
df_tot['atoms_isAromatic'] = [
    sum(atom.GetIsAromatic() for atom in mol.GetAtoms()) for mol in mols
]

In [90]:
# Show the kernel's current working directory.
import os
os.getcwd()

'/Users/skcc10170/Desktop/GIT/Toxic_Molecule/bss/code/eda'

In [93]:
# Recover the three splits from the combined frame by filtering on the 'idx' label column.
df_train_new, df_valid_new, df_test_new = (
    df_tot[df_tot['idx'] == split_label]
    for split_label in ('train', 'valid', 'test')
)

In [96]:
# Sanity check: each recovered split must have exactly as many rows as the split it came from.
for recovered, original in (
    (df_train_new, df_train),
    (df_valid_new, df_valid),
    (df_test_new, df_test),
):
    print(len(recovered) == len(original))

True
True
True


In [97]:
# Write each recovered split to its CSV file under data/new.
# NOTE(review): to_csv is called with its defaults, so the row index is written as
# an extra unnamed first column — confirm that downstream readers expect it.
for split_frame, relative_path in (
    (df_train_new, '/data/new/train_.csv'),
    (df_valid_new, '/data/new/valid_.csv'),
    (df_test_new, '/data/new/test_.csv'),
):
    split_frame.to_csv(CURRENT_PATH + relative_path)

# subgraph 찾기

In [103]:
# Display the combined frame (the rendered output below is a truncated preview).
df_tot

Unnamed: 0,MolWt,SMILES,clogp,ecfp_0,ecfp_1,ecfp_10,ecfp_100,ecfp_1000,ecfp_1001,ecfp_1002,...,ptfp_992,ptfp_993,ptfp_994,ptfp_995,ptfp_996,ptfp_997,ptfp_998,ptfp_999,qed,sa_score
0,476.523,Fc1ccc(C(=O)C2CCN(CCCOc3ccc(cc3)c4oc5ccccc5n4)...,6.13680,0,0,1,0,0,0,0,...,0,1,0,1,0,0,1,0,0.225978,2.309377
1,331.891,CC(C)n1c(CNC2CCCC2)nc(C)c1c3ccc(Cl)cc3,5.12502,0,1,0,0,0,0,0,...,0,1,1,1,0,0,1,0,0.823148,2.378411
2,488.375,COCC1=C([C@@H](c2ccc(Cl)c(Cl)c2)n3nccc3N1)C(=O...,4.76542,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,1,0.557406,3.892250
3,318.569,CCCCCCCCc1cccc(CCCCCCCC)[n+]1C,6.31710,0,0,0,0,0,0,0,...,0,1,1,1,0,0,1,0,0.279106,2.575898
4,485.032,CC[C@H]1CN([C@H](C)CN1C2CCN(CC2)C(=O)c3ccc(Cl)...,3.13122,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,0.700045,3.497750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9271,533.552,COc1ccc2nccc([C@@H](O)CC[C@@H]3CCN(CCSc4ccc[se...,3.92900,0,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0.301970,4.097021
9272,538.530,C[C@@]1(CC[C@](C)(C1)c2nc(c3ccc(cc3)C(=O)Nc4cc...,5.17710,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,0.318701,3.843137
9273,552.679,Cc1noc(n1)c2cccc(CN3CCN(CC3)C(=O)c4ccc(C[C@@H]...,3.79472,0,1,0,1,0,0,0,...,0,1,0,1,0,0,1,0,0.340064,3.729796
9274,572.578,CCN(CC)Cc1cc(\C=N\N=C(/N)\CC(O)c2cc3c(F)cc(F)c...,6.65220,0,1,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0.068820,3.708900
