In [1]:
# # Label balance: roughly 54.4% (toxic) vs. 45.5%
# df_tot['label'].value_counts(normalize=True) * 100

# for c in cols_mol:
#     func_histplot(c)

# col_smiles, cols_ecfp, cols_fcfp, cols_ptfp, cols_mol, col_label = classify_cols(df_tot)
# cols_atom = df_tot.columns[df_tot.columns.str.contains('num_atom')]



In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
%matplotlib inline

# Base directory handed to load_data(), which reads '<path>/dataset/...' beneath it.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
CURRENT_PATH = '/Users/skcc10170/Desktop'

def load_data(path):
    """Load the train and validation CSV files and their concatenation.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that contains ``dataset/train_.csv`` and
        ``dataset/valid_.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_all, df_train, df_test)``. ``df_all`` holds the training rows
        followed by the validation rows under a fresh ``0..n-1`` index.
        In every frame a column named ``'Unnamed: 0'`` (if present) is
        renamed to ``'idx'``.
    """
    index_rename = {'Unnamed: 0': "idx"}

    # An f-string (instead of `path + ...`) also accepts pathlib.Path values.
    df_train = pd.read_csv(f"{path}/dataset/train_.csv").rename(columns=index_rename)
    df_test = pd.read_csv(f"{path}/dataset/valid_.csv").rename(columns=index_rename)

    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True yields the same rows with a renumbered index.
    df_all = pd.concat([df_train, df_test], ignore_index=True)

    return df_all, df_train, df_test


def classify_cols(df):
    """Group the column names of ``df`` by feature family.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are strings.

    Returns
    -------
    tuple
        ``(col_smiles, col_ecfp, col_fcfp, col_ptfp, col_mol, col_label)``:

        * ``col_smiles`` -- the string ``'SMILES'``
        * ``col_ecfp`` / ``col_fcfp`` / ``col_ptfp`` -- lists of the column
          names containing ``'ecfp_'`` / ``'fcfp_'`` / ``'ptfp_'``, in frame
          order (the original notes say 1024 columns per fingerprint)
        * ``col_mol`` -- the fixed list of graph-level descriptor names; it is
          returned as-is, without checking that the columns exist in ``df``
        * ``col_label`` -- the string ``'label'``
    """
    cols = df.columns

    def _matching(token):
        # Column names that contain `token`, in their original order.
        return list(cols[cols.str.contains(token)])

    # Node/edge level: the three fingerprint families.
    col_ecfp = _matching('ecfp_')
    col_fcfp = _matching('fcfp_')
    col_ptfp = _matching('ptfp_')

    # Graph level descriptors.
    col_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

    return 'SMILES', col_ecfp, col_fcfp, col_ptfp, col_mol, 'label'


def func_histplot(col, df=None):
    """Overlay histograms of ``col`` for label 0 and label 1 rows.

    Parameters
    ----------
    col : str
        Name of the column to plot.
    df : pandas.DataFrame, optional
        Frame holding ``col`` and a ``'label'`` column. When omitted, the
        notebook-level ``df_tot`` is used, which is what the original
        one-argument call relied on.

    Returns
    -------
    None
        The histogram is drawn on a figure created by this call.
    """
    if df is None:
        df = df_tot  # resolved at call time from the notebook namespace

    # Create the figure *before* drawing. The previous version called
    # plt.figure() after drawing, which left an empty figure behind.
    fig, ax = plt.subplots()
    for label_value, legend_text in ((0, 'no toxic'), (1, 'toxic')):
        ax.hist(df.loc[df['label'] == label_value, col],
                bins=100, alpha=0.5, label=legend_text)
    ax.legend()
    ax.set_title(col)



In [3]:
df_tot, df_train, df_test = load_data(CURRENT_PATH)

# Parse every SMILES string once and reuse the molecule for all features below
# (the previous version re-parsed the same string for each feature).
mols = [Chem.MolFromSmiles(smiles) for smiles in df_tot['SMILES']]

# Chem.MolFromSmiles yields None for a string it cannot parse; report those
# rows explicitly instead of failing later with an AttributeError on None.
unparsable = [smiles for smiles, mol in zip(df_tot['SMILES'], mols) if mol is None]
if unparsable:
    raise ValueError(
        f"RDKit could not parse {len(unparsable)} SMILES string(s), e.g. {unparsable[:5]}"
    )


def atom_value_counts(molecules, atom_getter, keys, index):
    """Count, per molecule, how many atoms map to each value in ``keys``.

    Parameters
    ----------
    molecules : list
        Parsed RDKit molecules, one per output row.
    atom_getter : callable
        Maps an atom to the value being counted (symbol, degree, ...).
    keys : list
        Values to report; one output column per key, in this order.
    index : pandas.Index
        Row index for the result, so it aligns with the target frame.

    Returns
    -------
    pandas.DataFrame
        Float counts; a key that never occurs becomes a column of 0.0
        rather than raising KeyError.
    """
    tallies = [Counter(atom_getter(atom) for atom in mol.GetAtoms()) for mol in molecules]
    counts = pd.DataFrame(tallies, index=index)
    return counts.reindex(columns=keys).fillna(0).astype(float)


df_tot['num_atoms'] = [mol.GetNumAtoms() for mol in mols]

# Largest atom count of any molecule in the combined data.
MAX_LEN = df_tot['num_atoms'].max()

# Sorted so the `num_atom_*` column order is the same on every run
# (iteration order of a set of strings is not stable across sessions).
LIST_SYMBOLS = sorted({atom.GetSymbol() for mol in mols for atom in mol.GetAtoms()})

# (column prefix, per-atom getter, values that each get their own column)
# NOTE(review): degree 5 is absent from the original key list -- presumably it
# does not occur in this dataset; confirm before adding it.
for prefix, getter, keys in (
    ('num_atom_', lambda atom: atom.GetSymbol(), LIST_SYMBOLS),
    ('num_degree_', lambda atom: atom.GetDegree(), [0, 1, 2, 3, 4, 6]),
    ('num_numH_', lambda atom: atom.GetTotalNumHs(), [0, 1, 2, 3]),
    ('IV_', lambda atom: atom.GetImplicitValence(), [0, 1, 2, 3]),
):
    counts = atom_value_counts(mols, getter, keys, df_tot.index)
    for key in keys:
        df_tot[prefix + str(key)] = counts[key]

# Number of aromatic atoms per molecule.
df_tot['atoms_isAromatic'] = [
    sum(atom.GetIsAromatic() for atom in mol.GetAtoms()) for mol in mols
]

In [4]:
# Recover the train / validation rows from the feature-augmented frame by
# matching on `idx`. Series.isin performs the membership test in one vectorised
# pass instead of scanning the whole idx array once per row.
# NOTE(review): assumes `idx` values do not overlap between df_train and
# df_test; otherwise a row would land in both splits -- confirm.
df_train2 = df_tot[df_tot['idx'].isin(df_train['idx'])]
df_test2 = df_tot[df_tot['idx'].isin(df_test['idx'])]

In [6]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/Users/skcc10170/Desktop/GIT/Toxic_Molecule/bss/code/eda'

In [7]:
# Echo the configured base directory.
CURRENT_PATH

'/Users/skcc10170/Desktop'

In [8]:
# Persist the feature-augmented splits under CURRENT_PATH/dataset.
# `index` is left at its default, so the row index is written as well.
df_train2.to_csv(path_or_buf=CURRENT_PATH + '/dataset/train2_.csv')
df_test2.to_csv(path_or_buf=CURRENT_PATH + '/dataset/test2_.csv')

In [5]:
# Preview the feature-augmented frame (the rendered table is truncated).
df_tot

Unnamed: 0,idx,SMILES,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,num_degree_6,num_numH_0,num_numH_1,num_numH_2,num_numH_3,IV_0,IV_1,IV_2,IV_3,atoms_isAromatic
0,5147,CNC(=O)c1ncn2c1COc3c(CCN4CCN(CC4)c5cccc6nc(C)c...,0,0,0,0,0,0,0,0,...,0.0,17.0,10.0,7.0,2.0,17.0,10.0,7.0,2.0,21
1,5243,CN(C1CCN(Cc2ccc(cc2)C(F)(F)F)CC1)C(=O)Cc3ccc(c...,0,0,0,0,0,0,0,0,...,0.0,17.0,8.0,6.0,2.0,17.0,8.0,6.0,2.0,12
2,4735,COc1cc(ccc1n2cnc(C)c2)c3cn(CC(=O)N(C4CCCCC4)c5...,0,0,1,0,1,0,0,0,...,0.0,15.0,12.0,6.0,2.0,15.0,12.0,6.0,2.0,22
3,6527,CCOC(=O)[C@@H]1CC[C@@H](CC1)N2CC(C2)NC(=O)CNc3...,0,0,0,0,0,0,0,0,...,0.0,19.0,8.0,8.0,2.0,21.0,6.0,8.0,2.0,9
4,5803,COC1(CCOCC1)c2ccc(NC(=O)C3=CC(=O)c4cc(F)cc(c4O...,0,0,0,0,0,0,0,0,...,0.0,22.0,7.0,4.0,4.0,22.0,7.0,4.0,4.0,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8344,6396,CCc1cc2OCC(=O)Nc2nc1CNC34CCC(C[C@]5(O)CN6C(=O)...,0,0,0,0,0,0,0,0,...,0.0,21.0,7.0,10.0,1.0,21.0,7.0,10.0,1.0,16
8345,4106,Cc1ncoc1c2nnc(SCCCN3C[C@H]4C[C@]4(C3)c5ccc(cc5...,0,0,0,0,0,0,0,0,...,0.0,15.0,6.0,6.0,5.0,16.0,5.0,6.0,5.0,16
8346,3973,Cc1c(CCN2CCN(CC2)C(=O)Cc3cnc(cn3)n4cnnn4)ccc5C...,0,0,0,0,0,0,0,0,...,0.0,19.0,5.0,8.0,1.0,19.0,5.0,8.0,1.0,17
8347,7031,COc1cc(ncn1)N2C(=O)N(C(=O)C23CCN(Cc4ncccc4C)CC...,0,0,0,0,0,0,0,0,...,0.0,22.0,16.0,5.0,2.0,23.0,15.0,5.0,2.0,29
