In [16]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from rdkit import Chem
%matplotlib inline

import lightgbm as lgbm

from sklearn.model_selection import StratifiedKFold

In [17]:
# Workspace root; the CSV files are read from <root>/data/org/.
CURRENT_PATH = '/Users/skcc10170/Desktop'

# Load the train, validation and prediction-input tables in one pass.
df_train, df_valid, df_test = (
    pd.read_csv(CURRENT_PATH + '/data/org/' + csv_name)
    for csv_name in ('train_.csv', 'valid_.csv', 'predict_input.csv')
)

In [18]:
# Every column name present in the training table.
cols = df_train.columns

# Column that holds the SMILES string.
cols_smiles = 'SMILES'

# Node/edge-level features: three fingerprint families, picked by substring
# match on the column name (the original notes expect 1024 columns each).
cols_ecfp = cols[cols.str.contains('ecfp_')].tolist()
cols_fcfp = cols[cols.str.contains('fcfp_')].tolist()
cols_ptfp = cols[cols.str.contains('ptfp_')].tolist()

# Graph-level (whole-molecule) descriptors.
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# Model inputs: the fingerprint columns first, the molecule descriptors last.
cols_input = [*cols_ecfp, *cols_fcfp, *cols_ptfp, *cols_mol]

# Target column.
cols_label = 'label'

In [19]:
# Feature matrices and label vectors as NumPy arrays.
X_train = df_train[cols_input].values
y_train = df_train[cols_label].values

X_valid = df_valid[cols_input].values
y_valid = df_valid[cols_label].values

# Only the features are taken from the prediction-input table.
X_test = df_test[cols_input].values

In [90]:
# LightGBM configuration: binary objective, DART boosting, log-loss metric.
params = dict(
    objective='binary',
    learning_rate=0.02,
    num_leaves=60,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    boosting_type='dart',
    metric='binary_logloss',
    max_depth=32,
)

In [91]:
# Wrap the feature/label arrays in LightGBM datasets.
d_train = lgbm.Dataset(data=X_train, label=y_train)
d_valid = lgbm.Dataset(data=X_valid, label=y_valid)

In [92]:
# Boost for at most 30000 rounds, logging the validation metric every 50 rounds
# and asking for a stop after 50 rounds without improvement on d_valid.
# NOTE(review): params selects DART boosting; LightGBM's documentation says early
# stopping is not available in dart mode in newer releases — confirm for the
# installed version.
model = lgbm.train(
    params,
    d_train,
    num_boost_round=30000,
    valid_sets=[d_valid],
    verbose_eval=50,
    early_stopping_rounds=50,
)

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's binary_logloss: 0.576968
[100]	valid_0's binary_logloss: 0.553122
[150]	valid_0's binary_logloss: 0.535866
[200]	valid_0's binary_logloss: 0.514686
[250]	valid_0's binary_logloss: 0.499376
[300]	valid_0's binary_logloss: 0.484374
[350]	valid_0's binary_logloss: 0.47652
[400]	valid_0's binary_logloss: 0.462813
[450]	valid_0's binary_logloss: 0.451713
[500]	valid_0's binary_logloss: 0.446032
[550]	valid_0's binary_logloss: 0.44097
[600]	valid_0's binary_logloss: 0.440603
[650]	valid_0's binary_logloss: 0.435151
[700]	valid_0's binary_logloss: 0.432193
[750]	valid_0's binary_logloss: 0.43013
[800]	valid_0's binary_logloss: 0.427875
[850]	valid_0's binary_logloss: 0.42477
[900]	valid_0's binary_logloss: 0.421993
[950]	valid_0's binary_logloss: 0.419679
[1000]	valid_0's binary_logloss: 0.418045
[1050]	valid_0's binary_logloss: 0.4157
[1100]	valid_0's binary_logloss: 0.414157
[1150]	valid_0's binary_logloss: 0.41337

In [94]:
# Predicted probabilities for both splits, using num_iteration=model.best_iteration.
pred_train = model.predict(X_train, num_iteration=model.best_iteration)
pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)

# Accuracy at a 0.5 threshold: share of thresholded predictions equal to the label.
print(np.mean((pred_train > 0.5) == y_train))
print(np.mean((pred_valid > 0.5) == y_valid))

0.9851796407185629
0.8106650689035351


In [95]:
# y_valid_pred =  model.predict(X_valid, num_iteration=model.best_iteration)
# bce_loss = log_loss(y_valid, y_valid_pred)
# f1_loss = 1 - f1_score(y_valid, (y_valid_pred > 0.5))
# eval_loss = bce_loss + f1_loss
# eval_loss

# CV_baseline

In [97]:
# Workspace root; the CSV files are read from <root>/data/org/.
CURRENT_PATH = '/Users/skcc10170/Desktop'
df_train = pd.read_csv(f'{CURRENT_PATH}/data/org/train_.csv')
df_valid = pd.read_csv(f'{CURRENT_PATH}/data/org/valid_.csv')
df_test = pd.read_csv(f'{CURRENT_PATH}/data/org/predict_input.csv')

# Every column name present in the training table.
cols = df_train.columns

# Column that holds the SMILES string.
cols_smiles = 'SMILES'

# Node/edge-level features: three fingerprint families, picked by substring
# match on the column name (the original notes expect 1024 columns each).
cols_ecfp = cols[cols.str.contains('ecfp_')].tolist()
cols_fcfp = cols[cols.str.contains('fcfp_')].tolist()
cols_ptfp = cols[cols.str.contains('ptfp_')].tolist()

# Graph-level (whole-molecule) descriptors.
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# Model inputs: fingerprints first, molecule descriptors last.
cols_input = [*cols_ecfp, *cols_fcfp, *cols_ptfp, *cols_mol]

# Target column.
cols_label = 'label'


# Cross-validation settings.
n_splits = 5
random_state = 2020
num_test = len(df_test)  # 927 according to the original note

# Shuffled, seeded, label-stratified splitter.
kfold = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=random_state,
)

x_train = df_train[cols_input].values
y_train = df_train[cols_label].values
x_valid = df_valid[cols_input].values
y_valid = df_valid[cols_label].values
x_test = df_test[cols_input].values

# Pool the train and validation rows; the splitter re-partitions them per fold.
train_dataset_x = np.concatenate([x_train, x_valid], axis=0)
train_dataset_y = np.concatenate([y_train, y_valid])

In [101]:
# params = {
#     'objective' :'binary',
#     'learning_rate' : 0.012,
#     'num_leaves' : 60,
#     'feature_fraction': 0.64, 
#     'bagging_fraction': 0.8, 
#     'bagging_freq':1,
#     'boosting_type' : 'dart',
#     'metric': 'binary_logloss',
#     'max_depth' : 12
# }

# Active LightGBM configuration: binary objective, DART boosting, log-loss metric.
params = dict(
    objective='binary',
    learning_rate=0.02,
    num_leaves=60,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    boosting_type='dart',
    metric='binary_logloss',
    max_depth=32,
)

In [102]:
from sklearn.metrics import log_loss, f1_score, accuracy_score

# Stratified K-fold cross-validation: train one model per fold, score it on the
# held-out fold, and store its test-set probabilities in `results`.
num_test = len(x_test)

# One row per fold, one column per test row. The builtin `float` replaces the
# `np.float` alias, which recent NumPy releases no longer provide.
results = np.zeros((kfold.n_splits, num_test), dtype=float)

val_f1 = list()
val_acc = list()
for i, (train_idx, valid_idx) in enumerate(kfold.split(train_dataset_x, train_dataset_y)):

    # Integer-array indexing returns copies, so the pooled arrays stay untouched.
    x_train, y_train = train_dataset_x[train_idx], train_dataset_y[train_idx]
    x_valid, y_valid = train_dataset_x[valid_idx], train_dataset_y[valid_idx]

    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)

    # At most 30000 rounds; asks for a stop after 50 rounds without improvement
    # on this fold's held-out data.
    # NOTE(review): params selects DART boosting; LightGBM's documentation says early
    # stopping is not available in dart mode in newer releases — confirm for the
    # installed version.
    model = lgbm.train(params, d_train, 30000, valid_sets=[d_valid], verbose_eval=1000, early_stopping_rounds=50)

    # Threshold the predicted probabilities at 0.5 to obtain class labels.
    y_valid_pred = (model.predict(x_valid, num_iteration=model.best_iteration) > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)

    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    # Keep this fold's test-set probabilities.
    results[i] = model.predict(x_test, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)

# Fold-averaged validation metrics.
print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')

Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[552]	valid_0's binary_logloss: 0.441011
Fold 0 | Valid Accuracy: 0.8090963494913226, F1 Score: 0.8300479488545552
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[560]	valid_0's binary_logloss: 0.458541
Fold 1 | Valid Accuracy: 0.7833632555356074, F1 Score: 0.8006607929515418
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.436031
Early stopping, best iteration is:
[1598]	valid_0's binary_logloss: 0.424077
Fold 2 | Valid Accuracy: 0.799281006590773, F1 Score: 0.8174386920980927
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.420022
Early stopping, best iteration is:
[1102]	valid_0's binary_logloss: 0.417214
Fold 3 | Valid Accuracy: 0.7998801677651288, F1 Score: 0.8213903743315509
Training until validation scores don't improve for 50 rounds.
[1000

# CV_baseline_normalize

In [145]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from rdkit import Chem
%matplotlib inline

import lightgbm as lgbm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, f1_score, accuracy_score


# Workspace root; the CSV files are read from <root>/data/org/.
CURRENT_PATH = '/Users/skcc10170/Desktop'
df_train, df_valid, df_test = (
    pd.read_csv(CURRENT_PATH + '/data/org/' + csv_name)
    for csv_name in ('train_.csv', 'valid_.csv', 'predict_input.csv')
)

# Every column name present in the training table.
cols = df_train.columns

# Column that holds the SMILES string.
cols_smiles = 'SMILES'

# Node/edge-level features: three fingerprint families, picked by substring
# match on the column name (the original notes expect 1024 columns each).
cols_ecfp = cols[cols.str.contains('ecfp_')].tolist()
cols_fcfp = cols[cols.str.contains('fcfp_')].tolist()
cols_ptfp = cols[cols.str.contains('ptfp_')].tolist()

# Graph-level (whole-molecule) descriptors.
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# Model inputs: fingerprints first, molecule descriptors last.
cols_input = [*cols_ecfp, *cols_fcfp, *cols_ptfp, *cols_mol]

# Target column.
cols_label = 'label'

# All three tables stacked with a fresh 0..n-1 index; sort=True asks pandas to
# sort the column axis when the frames' columns differ.
df_tot = pd.concat([df_train, df_valid, df_test], sort=True)
df_tot = df_tot.reset_index(drop=True)

# Cross-validation settings.
n_splits = 5
random_state = 2020
num_test = len(df_test)  # 927 according to the original note

# Shuffled, seeded, label-stratified splitter.
kfold = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=random_state,
)

x_train = df_train[cols_input].values
y_train = df_train[cols_label].values
x_valid = df_valid[cols_input].values
y_valid = df_valid[cols_label].values
x_test = df_test[cols_input].values

# Pool the train and validation rows; the splitter re-partitions them per fold.
train_dataset_x = np.vstack((x_train, x_valid))
train_dataset_y = np.hstack((y_train, y_valid))

# LightGBM configuration: binary objective, DART boosting, log-loss metric.
params = dict(
    objective='binary',
    learning_rate=0.02,
    num_leaves=60,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    boosting_type='dart',
    metric='binary_logloss',
    max_depth=32,
)

# Stratified K-fold cross-validation with per-fold standardization of the
# molecule-descriptor columns (the trailing len(cols_mol) columns of cols_input).

# One row per fold, one column per test row. The builtin `float` replaces the
# `np.float` alias, which recent NumPy releases no longer provide.
results = np.zeros((kfold.n_splits, num_test), dtype=float)

# Number of trailing columns to standardize; cols_input ends with cols_mol.
n_scaled = len(cols_mol)

val_f1 = list()
val_acc = list()
for i, (train_idx, valid_idx) in enumerate(kfold.split(train_dataset_x, train_dataset_y)):

    # Integer-array indexing returns copies, so scaling them in place below does
    # not touch the pooled arrays.
    x_train, y_train = train_dataset_x[train_idx], train_dataset_y[train_idx]
    x_valid, y_valid = train_dataset_x[valid_idx], train_dataset_y[valid_idx]

    # Scale a fresh copy of the test features on every fold. Scaling x_test in
    # place would re-standardize already standardized values from fold 1 onward,
    # feeding each later model test features on a different scale than its
    # training data.
    x_test_fold = x_test.copy()

    # Statistics come from this fold's training rows only.
    tr_mean = x_train[:, -n_scaled:].mean(axis=0)
    tr_std = x_train[:, -n_scaled:].std(axis=0)
    # A constant column has zero spread; divide by 1 instead of producing NaN/inf.
    tr_std[tr_std == 0] = 1.0

    # Apply the same shift and scale to all three splits.
    x_train[:, -n_scaled:] = (x_train[:, -n_scaled:] - tr_mean) / tr_std
    x_valid[:, -n_scaled:] = (x_valid[:, -n_scaled:] - tr_mean) / tr_std
    x_test_fold[:, -n_scaled:] = (x_test_fold[:, -n_scaled:] - tr_mean) / tr_std

    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)

    # At most 30000 rounds; asks for a stop after 50 rounds without improvement
    # on this fold's held-out data.
    # NOTE(review): params selects DART boosting; LightGBM's documentation says early
    # stopping is not available in dart mode in newer releases — confirm for the
    # installed version.
    model = lgbm.train(params, d_train, 30000, valid_sets=[d_valid], verbose_eval=1000, early_stopping_rounds=50)

    # Threshold the predicted probabilities at 0.5 to obtain class labels.
    y_valid_pred = (model.predict(x_valid, num_iteration=model.best_iteration) > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)

    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    # Keep this fold's test-set probabilities.
    results[i] = model.predict(x_test_fold, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)

# Fold-averaged validation metrics.
print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')

Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.419536
Early stopping, best iteration is:
[1423]	valid_0's binary_logloss: 0.408571
Fold 0 | Valid Accuracy: 0.8168761220825853, F1 Score: 0.8363636363636363
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.43488
Early stopping, best iteration is:
[1482]	valid_0's binary_logloss: 0.4241
Fold 1 | Valid Accuracy: 0.7971274685816876, F1 Score: 0.812603648424544
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.433543
Early stopping, best iteration is:
[1462]	valid_0's binary_logloss: 0.421911
Fold 2 | Valid Accuracy: 0.8040742959856201, F1 Score: 0.821409066084107
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.423133
Early stopping, best iteration is:
[1229]	valid_0's binary_logloss: 0.416347
Fold 3 | Valid Accuracy: 0.8034751348112642, F1 Score: 0.82

# CV_baseline_add mol feature_normalize

In [146]:
from sklearn.model_selection import StratifiedKFold
# Cross-validation settings.
n_splits = 5
random_state = 2020

# Reload the raw tables from <root>/data/org/.
CURRENT_PATH = '/Users/skcc10170/Desktop'
df_train = pd.read_csv(CURRENT_PATH + '/data/org/train_.csv')
df_valid = pd.read_csv(CURRENT_PATH + '/data/org/valid_.csv')
df_test = pd.read_csv(CURRENT_PATH + '/data/org/predict_input.csv')

# Count the test rows only after df_test has been loaded in this cell, so the
# value does not depend on a df_test left over from an earlier cell.
num_test = len(df_test)  # 927 according to the original note

In [147]:
# Tag every row with the split it came from so the combined table can be split again.
df_train['idx'], df_valid['idx'], df_test['idx'] = 'train', 'valid', 'test'

# Stack the three tables and give the result a fresh 0..n-1 index; sort=True asks
# pandas to sort the column axis when the frames' columns differ.
df_tot = pd.concat([df_train, df_valid, df_test], sort=True)
df_tot = df_tot.reset_index(drop=True)

In [148]:
# Feature engineering: per-molecule atom statistics derived from the SMILES strings.


def _add_atom_count_features(frame, mols, atom_getter, keys, prefix):
    """Add one column per key counting the atoms of each molecule that yield that key.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to extend in place; must have one row per entry of `mols`.
    mols : sequence
        Parsed RDKit molecules, in the row order of `frame`.
    atom_getter : callable
        Maps an RDKit atom to the value being counted.
    keys : list
        Values to report. A key no atom produces yields an all-zero column instead
        of a KeyError; values that are not listed are ignored.
    prefix : str
        Column-name prefix; the column for a key is named ``prefix + str(key)``.

    Returns
    -------
    pandas.DataFrame
        The count table restricted to `keys`, with missing counts filled with 0.
    """
    per_mol_values = pd.Series(
        [[atom_getter(atom) for atom in mol.GetAtoms()] for mol in mols],
        index=frame.index,
    )
    # One row per molecule, one column per distinct value, NaN where a value is absent.
    table = per_mol_values.apply(lambda values: pd.Series(values).value_counts())
    table = table.reindex(columns=keys).fillna(0)
    for key in keys:
        frame[prefix + str(key)] = table[key]
    return table


# Parse every SMILES string exactly once and reuse the molecules below.
_mols = [Chem.MolFromSmiles(smiles) for smiles in df_tot['SMILES']]
_unparsed = [smiles for smiles, mol in zip(df_tot['SMILES'], _mols) if mol is None]
if _unparsed:
    # Chem.MolFromSmiles returns None for input it cannot parse; fail with a clear message.
    raise ValueError(f'RDKit could not parse {len(_unparsed)} SMILES, e.g. {_unparsed[0]!r}')

# Number of atoms in each molecule.
df_tot['num_atoms'] = [mol.GetNumAtoms() for mol in _mols]

# Largest atom count over all rows (88 according to the original note).
MAX_LEN = df_tot['num_atoms'].max()

# Every atom symbol seen in the data. Sorted so that the order of the generated
# columns is the same on every run (iteration order of a set of strings is not).
LIST_SYMBOLS = sorted({atom.GetSymbol() for mol in _mols for atom in mol.GetAtoms()})

# Atoms per element symbol.
temp_df = _add_atom_count_features(df_tot, _mols, lambda atom: atom.GetSymbol(), LIST_SYMBOLS, 'num_atom_')

# Atoms per degree value (only the listed degrees are kept).
temp_df = _add_atom_count_features(df_tot, _mols, lambda atom: atom.GetDegree(), [0, 1, 2, 3, 4, 6], 'num_degree_')

# Atoms per total hydrogen count.
temp_df = _add_atom_count_features(df_tot, _mols, lambda atom: atom.GetTotalNumHs(), [0, 1, 2, 3], 'num_numH_')

# Atoms per implicit valence.
temp_df = _add_atom_count_features(df_tot, _mols, lambda atom: atom.GetImplicitValence(), [0, 1, 2, 3], 'IV_')

# Number of aromatic atoms in each molecule.
df_tot['atoms_isAromatic'] = [sum(atom.GetIsAromatic() for atom in mol.GetAtoms()) for mol in _mols]

In [149]:
# cols.difference(cols_ecfp).difference(cols_fcfp).difference(cols_ptfp).difference(cols_atom).difference(cols_mol).difference(cols_IV)

In [160]:
# Every column name in the combined, feature-augmented table.
cols = df_tot.columns

# Column that holds the SMILES string.
cols_smiles = 'SMILES'

# Node/edge-level features: three fingerprint families, picked by substring
# match on the column name (the original notes expect 1024 columns each).
cols_ecfp = cols[cols.str.contains('ecfp_')].tolist()
cols_fcfp = cols[cols.str.contains('fcfp_')].tolist()
cols_ptfp = cols[cols.str.contains('ptfp_')].tolist()

# Graph-level (whole-molecule) descriptors.
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# Engineered features, again picked by substring match on the column name.
cols_atom = cols[cols.str.contains('num_')].tolist()
cols_IV = cols[cols.str.contains('IV_')].tolist()
cols_aromatic = ['atoms_isAromatic']


# Model inputs: fingerprint block first, then descriptor + engineered block.
cols_input1 = [*cols_ecfp, *cols_fcfp, *cols_ptfp]
cols_input2 = [*cols_mol, *cols_atom, *cols_IV, *cols_aromatic]
cols_input = [*cols_input1, *cols_input2]

# Target column.
cols_label = 'label'

In [161]:
# Display how many non-fingerprint feature columns there are
# (molecule descriptors + atom-count, implicit-valence and aromatic features).
len(cols_input2)

33

In [162]:
# Recover the three original splits from the combined table through the 'idx' tag.
df_train = df_tot.loc[df_tot['idx'].eq('train')]
df_valid = df_tot.loc[df_tot['idx'].eq('valid')]
df_test = df_tot.loc[df_tot['idx'].eq('test')]

In [163]:
from sklearn.model_selection import StratifiedKFold
# Cross-validation settings.
n_splits = 5
random_state = 2020
num_test = len(df_test)  # 927 according to the original note

# Shuffled, seeded, label-stratified splitter.
kfold = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=random_state,
)

x_train = df_train[cols_input].values
y_train = df_train[cols_label].values
x_valid = df_valid[cols_input].values
y_valid = df_valid[cols_label].values
x_test = df_test[cols_input].values

# Pool the train and validation rows; the splitter re-partitions them per fold.
train_dataset_x = np.concatenate([x_train, x_valid], axis=0)
train_dataset_y = np.concatenate([y_train, y_valid])

In [164]:
# LightGBM configuration: binary objective, DART boosting, log-loss metric.
params = dict(
    objective='binary',
    learning_rate=0.02,
    num_leaves=60,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    boosting_type='dart',
    metric='binary_logloss',
    max_depth=32,
)

In [165]:
from sklearn.metrics import log_loss, f1_score, accuracy_score
# Stratified K-fold cross-validation with per-fold standardization of the
# non-fingerprint features (the trailing len(cols_input2) columns of cols_input).

# One row per fold, one column per test row. The builtin `float` replaces the
# `np.float` alias, which recent NumPy releases no longer provide.
results = np.zeros((kfold.n_splits, num_test), dtype=float)

# Number of trailing columns to standardize, derived from the column list
# instead of a hard-coded 33 so it follows any change to the engineered features.
n_scaled = len(cols_input2)

val_f1 = list()
val_acc = list()
for i, (train_idx, valid_idx) in enumerate(kfold.split(train_dataset_x, train_dataset_y)):

    # Integer-array indexing returns copies, so scaling them in place below does
    # not touch the pooled arrays.
    x_train, y_train = train_dataset_x[train_idx], train_dataset_y[train_idx]
    x_valid, y_valid = train_dataset_x[valid_idx], train_dataset_y[valid_idx]

    # Scale a fresh copy of the test features on every fold. Scaling x_test in
    # place would re-standardize already standardized values from fold 1 onward,
    # feeding each later model test features on a different scale than its
    # training data.
    x_test_fold = x_test.copy()

    # Statistics come from this fold's training rows only.
    tr_mean = x_train[:, -n_scaled:].mean(axis=0)
    tr_std = x_train[:, -n_scaled:].std(axis=0)
    # A feature that is constant within the training fold (e.g. a count column
    # that is zero for every training row) has zero spread; divide by 1 instead
    # of producing NaN/inf.
    # NOTE(review): the truncated warning lines in the recorded output of the
    # original run would be consistent with such a division by zero — not confirmed.
    tr_std[tr_std == 0] = 1.0

    # Apply the same shift and scale to all three splits.
    x_train[:, -n_scaled:] = (x_train[:, -n_scaled:] - tr_mean) / tr_std
    x_valid[:, -n_scaled:] = (x_valid[:, -n_scaled:] - tr_mean) / tr_std
    x_test_fold[:, -n_scaled:] = (x_test_fold[:, -n_scaled:] - tr_mean) / tr_std

    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)

    # At most 30000 rounds; asks for a stop after 50 rounds without improvement
    # on this fold's held-out data.
    # NOTE(review): params selects DART boosting; LightGBM's documentation says early
    # stopping is not available in dart mode in newer releases — confirm for the
    # installed version.
    model = lgbm.train(params, d_train, 30000, valid_sets=[d_valid], verbose_eval=1000, early_stopping_rounds=50)

    # Threshold the predicted probabilities at 0.5 to obtain class labels.
    y_valid_pred = (model.predict(x_valid, num_iteration=model.best_iteration) > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)

    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    # Keep this fold's test-set probabilities.
    results[i] = model.predict(x_test_fold, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)

# Fold-averaged validation metrics.
print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')

Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.41769
Early stopping, best iteration is:
[1135]	valid_0's binary_logloss: 0.414035
Fold 0 | Valid Accuracy: 0.8210652304009575, F1 Score: 0.8395061728395061
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[560]	valid_0's binary_logloss: 0.45602
Fold 1 | Valid Accuracy: 0.7929383602633154, F1 Score: 0.8105147864184009
Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.432964
Early stopping, best iteration is:
[1102]	valid_0's binary_logloss: 0.429836
Fold 2 | Valid Accuracy: 0.7998801677651288, F1 Score: 0.8172866520787746
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[560]	valid_0's binary_logloss: 0.439739
Fold 3 | Valid Accuracy: 0.793289394847214, F1 Score: 0.8158035237586759


  app.launch_new_instance()
  app.launch_new_instance()


Training until validation scores don't improve for 50 rounds.
[1000]	valid_0's binary_logloss: 0.419241
Early stopping, best iteration is:
[1484]	valid_0's binary_logloss: 0.410243
Fold 4 | Valid Accuracy: 0.8148591971240263, F1 Score: 0.8346709470304976
Valid Accuracy: 0.8044064700801284, F1 Score: 0.823556416425171
