In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from rdkit import Chem
%matplotlib inline

import lightgbm as lgbm

# CV_baseline_add mol feature_normalize

In [5]:
from sklearn.model_selection import StratifiedKFold
n_splits=5
random_state = 2020
# num_test = len(X_test) # 927

CURRENT_PATH = '/Users/skcc10170/Desktop'
df_train = pd.read_csv(CURRENT_PATH + '/data/org/train_.csv')
df_valid = pd.read_csv(CURRENT_PATH + '/data/org/valid_.csv')
df_test = pd.read_csv(CURRENT_PATH + '/data/org/predict_input.csv')

In [6]:
df_train['idx'] = 'train'
df_valid['idx'] = 'valid'
df_test['idx'] = 'test'

df_tot = pd.concat([df_train, df_valid, df_test], sort=True).reset_index(drop=True)

In [7]:
# feature 추가 
df_tot['num_atoms'] = df_tot['SMILES'].apply(lambda x: Chem.MolFromSmiles(x).GetNumAtoms())

# MAX_LEN = 88개 원자가 최댓값
MAX_LEN = df_tot['num_atoms'].max()

LIST_SYMBOLS = list(set.union(*df_tot['SMILES'].apply(
    lambda x: set([atom.GetSymbol() for atom in Chem.MolFromSmiles(x).GetAtoms()])).values))

df_tot['atoms_list'] = df_tot['SMILES'].apply(lambda x: [atom.GetSymbol() for atom in Chem.MolFromSmiles(x).GetAtoms()])

temp_df =  df_tot['atoms_list'].apply(lambda x: pd.Series(x).value_counts())
for symbol in LIST_SYMBOLS:
    df_tot['num_atom_'+symbol] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_list']

df_tot['atoms_degree'] = df_tot['SMILES'].apply(lambda x: [atom.GetDegree() for atom in Chem.MolFromSmiles(x).GetAtoms()])
temp_df = df_tot['atoms_degree'].apply(lambda x: pd.Series(x).value_counts())
for symbol in [0,1,2,3,4,6]:
    df_tot['num_degree_'+str(symbol)] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_degree']

df_tot['atoms_numH'] = df_tot['SMILES'].apply(lambda x: [atom.GetTotalNumHs() for atom in Chem.MolFromSmiles(x).GetAtoms()])
temp_df = df_tot['atoms_numH'].apply(lambda x: pd.Series(x).value_counts())
for symbol in [0,1,2,3]:
    df_tot['num_numH_'+str(symbol)] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_numH']

df_tot['atoms_IV'] = df_tot['SMILES'].apply(lambda x: [atom.GetImplicitValence() for atom in Chem.MolFromSmiles(x).GetAtoms()])
temp_df = df_tot['atoms_IV'].apply(lambda x: pd.Series(x).value_counts())
for symbol in [0,1,2,3]:
    df_tot['IV_'+str(symbol)] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_IV']

df_tot['atoms_isAromatic'] = df_tot['SMILES'].apply(lambda x: sum([atom.GetIsAromatic() for atom in Chem.MolFromSmiles(x).GetAtoms()]))

In [9]:
# cols.difference(cols_ecfp).difference(cols_fcfp).difference(cols_ptfp).difference(cols_atom).difference(cols_mol).difference(cols_IV)

In [10]:
# find all columns
cols = df_tot.columns

# smiles code
cols_smiles = 'SMILES'

# node-edge level (3 footprints)
cols_ecfp = list(cols[cols.str.contains('ecfp_')]) # ecfp 1024개
cols_fcfp = list(cols[cols.str.contains('fcfp_')]) # fcfp 1024개
cols_ptfp = list(cols[cols.str.contains('ptfp_')]) # ptfp 1024개

# graph level
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# new feature
cols_atom = list(cols[cols.str.contains('num_')])
cols_IV = list(cols[cols.str.contains('IV_')])
cols_aromatic = ['atoms_isAromatic']


# input cols
cols_input1 = cols_ecfp + cols_fcfp + cols_ptfp
cols_input2 = cols_mol + cols_atom + cols_IV + cols_aromatic
cols_input = cols_input1 + cols_input2

# label
cols_label = 'label'

In [11]:
df_tot[cols_input2] = (df_tot[cols_input2] - df_tot[cols_input2].mean()) / df_tot[cols_input2].std()

In [12]:
df_train = df_tot[df_tot['idx'] == 'train']
df_valid = df_tot[df_tot['idx'] == 'valid']
df_test = df_tot[df_tot['idx'] == 'test']

In [13]:
from sklearn.model_selection import StratifiedKFold
n_splits=5
random_state = 2020
num_test = len(df_test) # 927

kfold = StratifiedKFold(n_splits=n_splits,
                        random_state=random_state,
                        shuffle=True)

x_train, y_train = df_train[cols_input].values, df_train[cols_label].values
x_valid, y_valid = df_valid[cols_input].values, df_valid[cols_label].values
x_test = df_test[cols_input].values

train_dataset_x = np.vstack([x_train, x_valid])
train_dataset_y = np.hstack([y_train, y_valid])

In [14]:
params = {
    'objective' :'binary',
    'learning_rate' : 0.012,
    'num_leaves' : 60,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'dart',
    'metric': 'binary_logloss',
    'max_depth' : 12
}

In [15]:
from sklearn.metrics import log_loss, f1_score, accuracy_score
results = np.zeros((kfold.n_splits, num_test), dtype=np.float)

val_f1 = list()
val_acc = list()
for i, (train_idx, valid_idx) in enumerate(kfold.split(train_dataset_x, train_dataset_y)):
    
    x_train, y_train = train_dataset_x[train_idx], train_dataset_y[train_idx]
    x_valid, y_valid = train_dataset_x[valid_idx], train_dataset_y[valid_idx]
    
    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)
    
#     model = lgb_model.train(num_iter, i)
    model = lgbm.train(params, d_train, 10000, valid_sets=[d_valid], verbose_eval=1000, early_stopping_rounds=50)

#     y_valid_pred = np.argmax(model.predict(x_valid, num_iteration=model.best_iteration), axis=1)
    y_valid_pred = (model.predict(x_valid, num_iteration=model.best_iteration) > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)
    
    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    results[i] = model.predict(x_test, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)

print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')

Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.447497
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.428823
Fold 0 | Valid Accuracy: 0.8138839018551766, F1 Score: 0.8337787279529664
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.464886
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.447287
Fold 1 | Valid Accuracy: 0.7953321364452424, F1 Score: 0.8120879120879121
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.462895
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.445733
Fold 2 | Valid Accuracy: 0.7890952666267226, F1 Score: 0.808487486398259
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.447814
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.431316
Fold 3 | Valid Accuracy: 0.799281006590773, F1 Score: 0

# CV_baseline_add mol feature2_normalize

- 지문데이터 1024개를 여러 형태로 뿔려서 만들어보기

In [16]:
CURRENT_PATH = '/Users/skcc10170/Desktop'
df_train = pd.read_csv(CURRENT_PATH + '/data/org/train_.csv')
df_valid = pd.read_csv(CURRENT_PATH + '/data/org/valid_.csv')
df_test = pd.read_csv(CURRENT_PATH + '/data/org/predict_input.csv')

In [17]:
df_train['idx'] = 'train'
df_valid['idx'] = 'valid'
df_test['idx'] = 'test'

df_tot = pd.concat([df_train, df_valid, df_test], sort=True).reset_index(drop=True)

In [18]:
# feature 추가 
df_tot['num_atoms'] = df_tot['SMILES'].apply(lambda x: Chem.MolFromSmiles(x).GetNumAtoms())

# MAX_LEN = 88개 원자가 최댓값
MAX_LEN = df_tot['num_atoms'].max()

LIST_SYMBOLS = list(set.union(*df_tot['SMILES'].apply(
    lambda x: set([atom.GetSymbol() for atom in Chem.MolFromSmiles(x).GetAtoms()])).values))

df_tot['atoms_list'] = df_tot['SMILES'].apply(lambda x: [atom.GetSymbol() for atom in Chem.MolFromSmiles(x).GetAtoms()])

temp_df =  df_tot['atoms_list'].apply(lambda x: pd.Series(x).value_counts())
for symbol in LIST_SYMBOLS:
    df_tot['num_atom_'+symbol] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_list']

df_tot['atoms_degree'] = df_tot['SMILES'].apply(lambda x: [atom.GetDegree() for atom in Chem.MolFromSmiles(x).GetAtoms()])
temp_df = df_tot['atoms_degree'].apply(lambda x: pd.Series(x).value_counts())
for symbol in [0,1,2,3,4,6]:
    df_tot['num_degree_'+str(symbol)] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_degree']

df_tot['atoms_numH'] = df_tot['SMILES'].apply(lambda x: [atom.GetTotalNumHs() for atom in Chem.MolFromSmiles(x).GetAtoms()])
temp_df = df_tot['atoms_numH'].apply(lambda x: pd.Series(x).value_counts())
for symbol in [0,1,2,3]:
    df_tot['num_numH_'+str(symbol)] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_numH']

df_tot['atoms_IV'] = df_tot['SMILES'].apply(lambda x: [atom.GetImplicitValence() for atom in Chem.MolFromSmiles(x).GetAtoms()])
temp_df = df_tot['atoms_IV'].apply(lambda x: pd.Series(x).value_counts())
for symbol in [0,1,2,3]:
    df_tot['IV_'+str(symbol)] = temp_df[symbol].replace(np.NaN, 0)
del df_tot['atoms_IV']

df_tot['atoms_isAromatic'] = df_tot['SMILES'].apply(lambda x: sum([atom.GetIsAromatic() for atom in Chem.MolFromSmiles(x).GetAtoms()]))

In [19]:
# cols.difference(cols_ecfp).difference(cols_fcfp).difference(cols_ptfp).difference(cols_atom).difference(cols_mol).difference(cols_IV)

In [20]:
# find all columns
cols = df_tot.columns

# smiles code
cols_smiles = 'SMILES'

# node-edge level (3 footprints)
cols_ecfp = list(cols[cols.str.contains('ecfp_')]) # ecfp 1024개
cols_fcfp = list(cols[cols.str.contains('fcfp_')]) # fcfp 1024개
cols_ptfp = list(cols[cols.str.contains('ptfp_')]) # ptfp 1024개

for i in range(int(len(cols_ecfp)/32)):
    df_tot['ecfpnew_' + str(i)] = df_tot[pd.Series(cols_ecfp).apply(lambda x: int(x.split('_')[1])
                                                                   ).sort_values().apply(lambda x: 'ecfp_'+ str(x)
                                                                                        ).values[int(32*i):int(32*(i+1))]].mean(axis=1)
cols_ecfpnew = list(cols[cols.str.contains('ecfpnew_')]) # ecfp 1024개

for i in range(int(len(cols_fcfp)/32)):
    df_tot['fcfpnew_' + str(i)] = df_tot[pd.Series(cols_fcfp).apply(lambda x: int(x.split('_')[1])
                                                                   ).sort_values().apply(lambda x: 'fcfp_'+ str(x)
                                                                                        ).values[int(32*i):int(32*(i+1))]].mean(axis=1)
cols_fcfpnew = list(cols[cols.str.contains('fcfpnew_')]) # ecfp 1024개

for i in range(int(len(cols_ptfp)/32)):
    df_tot['ptfpnew_' + str(i)] = df_tot[pd.Series(cols_ptfp).apply(lambda x: int(x.split('_')[1])
                                                                   ).sort_values().apply(lambda x: 'ptfp_'+ str(x)
                                                                                        ).values[int(32*i):int(32*(i+1))]].mean(axis=1)
cols_ptfpnew = list(cols[cols.str.contains('ptfpnew_')]) # ecfp 1024개


    
# graph level
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# new feature
cols_atom = list(cols[cols.str.contains('num_')])
cols_IV = list(cols[cols.str.contains('IV_')])
cols_aromatic = ['atoms_isAromatic']


# input cols
cols_input1 = cols_ecfp + cols_fcfp + cols_ptfp + cols_ecfpnew + cols_fcfpnew + cols_ptfpnew
cols_input2 = cols_mol + cols_atom + cols_IV + cols_aromatic
cols_input = cols_input1 + cols_input2

# label
cols_label = 'label'

In [21]:
df_tot[cols_input2] = (df_tot[cols_input2] - df_tot[cols_input2].mean()) / df_tot[cols_input2].std()

In [22]:
df_train = df_tot[df_tot['idx'] == 'train']
df_valid = df_tot[df_tot['idx'] == 'valid']
df_test = df_tot[df_tot['idx'] == 'test']

In [23]:
from sklearn.model_selection import StratifiedKFold
n_splits=5
random_state = 2020
num_test = len(df_test) # 927

kfold = StratifiedKFold(n_splits=n_splits,
                        random_state=random_state,
                        shuffle=True)

x_train, y_train = df_train[cols_input].values, df_train[cols_label].values
x_valid, y_valid = df_valid[cols_input].values, df_valid[cols_label].values
x_test = df_test[cols_input].values

train_dataset_x = np.vstack([x_train, x_valid])
train_dataset_y = np.hstack([y_train, y_valid])

In [24]:
params = {
    'objective' :'binary',
    'learning_rate' : 0.012,
    'num_leaves' : 60,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'dart',
    'metric': 'binary_logloss',
    'max_depth' : 12
}

In [25]:
from sklearn.metrics import log_loss, f1_score, accuracy_score
results = np.zeros((kfold.n_splits, num_test), dtype=np.float)

val_f1 = list()
val_acc = list()
for i, (train_idx, valid_idx) in enumerate(kfold.split(train_dataset_x, train_dataset_y)):
    
    x_train, y_train = train_dataset_x[train_idx], train_dataset_y[train_idx]
    x_valid, y_valid = train_dataset_x[valid_idx], train_dataset_y[valid_idx]
    
    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)
    
#     model = lgb_model.train(num_iter, i)
    model = lgbm.train(params, d_train, 10000, valid_sets=[d_valid], verbose_eval=1000, early_stopping_rounds=50)

#     y_valid_pred = np.argmax(model.predict(x_valid, num_iteration=model.best_iteration), axis=1)
    y_valid_pred = (model.predict(x_valid, num_iteration=model.best_iteration) > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)
    
    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    results[i] = model.predict(x_test, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)

print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')

Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.447497
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.428823
Fold 0 | Valid Accuracy: 0.8138839018551766, F1 Score: 0.8337787279529664
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.464886
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.447287
Fold 1 | Valid Accuracy: 0.7953321364452424, F1 Score: 0.8120879120879121
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.462895
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.445733
Fold 2 | Valid Accuracy: 0.7890952666267226, F1 Score: 0.808487486398259
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.447814
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.431316
Fold 3 | Valid Accuracy: 0.799281006590773, F1 Score: 0