### import library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors
%matplotlib inline

import lightgbm as lgbm

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import log_loss, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, auc

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### load data

In [2]:
CURRENT_PATH = '/Users/skcc10170/Desktop'

df_train = pd.read_csv(CURRENT_PATH + '/data/org/train_.csv')
df_valid = pd.read_csv(CURRENT_PATH + '/data/org/valid_.csv')
df_test = pd.read_csv(CURRENT_PATH + '/data/org/predict_input.csv')

df_train['type'] = 'train'
df_valid['type'] = 'valid'
df_test['type']  = 'test'

df_tot = pd.concat([df_train, df_valid, df_test], sort=True).reset_index(drop=True)

### feature generating from molecule

In [3]:
df_tot['myf_NumAtoms']            = df_tot.SMILES.apply(lambda x: Chem.MolFromSmiles(x).GetNumAtoms())
df_tot['myf_NumHeavyAtoms']       = df_tot.SMILES.apply(lambda x: Chem.MolFromSmiles(x).GetNumHeavyAtoms())
df_tot['myf_GetNumBonds']         = df_tot.SMILES.apply(lambda x: Chem.MolFromSmiles(x).GetNumBonds())
df_tot['myf_GetNumHeavyBonds']    = df_tot.SMILES.apply(lambda x: Chem.MolFromSmiles(x).GetNumBonds(onlyHeavy=True))

df_tot['myf_ExactMolWt']          = df_tot.SMILES.apply(lambda x: Descriptors.ExactMolWt(Chem.MolFromSmiles(x)))
df_tot['myf_HeavyAtomMolWt']      = df_tot.SMILES.apply(lambda x: Descriptors.HeavyAtomMolWt(Chem.MolFromSmiles(x)))
df_tot['myf_FpDensityMorgan1']    = df_tot.SMILES.apply(lambda x: Descriptors.FpDensityMorgan1(Chem.MolFromSmiles(x)))
df_tot['myf_FpDensityMorgan2']    = df_tot.SMILES.apply(lambda x: Descriptors.FpDensityMorgan2(Chem.MolFromSmiles(x)))
df_tot['myf_FpDensityMorgan3']    = df_tot.SMILES.apply(lambda x: Descriptors.FpDensityMorgan3(Chem.MolFromSmiles(x)))
df_tot['myf_MaxAbsPartialCharge'] = df_tot.SMILES.apply(lambda x: Descriptors.MaxAbsPartialCharge(Chem.MolFromSmiles(x)))
df_tot['myf_MaxPartialCharge']    = df_tot.SMILES.apply(lambda x: Descriptors.MaxPartialCharge(Chem.MolFromSmiles(x)))
df_tot['myf_MinAbsPartialCharge'] = df_tot.SMILES.apply(lambda x: Descriptors.MinAbsPartialCharge(Chem.MolFromSmiles(x)))
df_tot['myf_MinPartialCharge']    = df_tot.SMILES.apply(lambda x: Descriptors.MinPartialCharge(Chem.MolFromSmiles(x)))
df_tot['myf_MolWt']               = df_tot.SMILES.apply(lambda x: Descriptors.MolWt(Chem.MolFromSmiles(x)))
df_tot['myf_NumRadicalElectrons'] = df_tot.SMILES.apply(lambda x: Descriptors.NumRadicalElectrons(Chem.MolFromSmiles(x)))
df_tot['myf_NumValenceElectrons'] = df_tot.SMILES.apply(lambda x: Descriptors.NumValenceElectrons(Chem.MolFromSmiles(x)))

df_tot['myf_NumDoubleBondType'] = df_tot.SMILES.apply(lambda x: len([1 for b in Chem.MolFromSmiles(x).GetBonds() if b.GetBondTypeAsDouble() == 1.0]))

### data feature

In [4]:
# find all columns
cols = df_train.columns

# smiles code
cols_smiles = 'SMILES'

# node-edge level (3 footprints)
cols_ecfp = list(cols[cols.str.contains('ecfp_')]) # ecfp 1024개
cols_fcfp = list(cols[cols.str.contains('fcfp_')]) # fcfp 1024개
cols_ptfp = list(cols[cols.str.contains('ptfp_')]) # ptfp 1024개

# graph level
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

### new features
cols_new_f = list(cols[cols.str.contains('myf_')])

# input cols
cols_input1 = cols_ecfp + cols_fcfp + cols_ptfp # don't have to normalize
cols_input2 = cols_mol + cols_new_f # have to normalize
cols_input  = cols_input1 + cols_input2

# label
cols_label = 'label'

### CV start!

In [5]:
n_splits     = 5
random_state = 2020
random_seed  = 2020
data_random_seed = 2020
feature_fraction_seed = 2020

num_test     = len(df_test) # 927

kfold = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

In [6]:
x_train, y_train = df_train[cols_input].values, df_train[cols_label].values
x_valid, y_valid = df_valid[cols_input].values, df_valid[cols_label].values
x_test            = df_test[cols_input].values

trainset_x = np.vstack([x_train, x_valid])
trainset_y = np.hstack([y_train, y_valid])

In [22]:
params_org = {
    'objective' :'binary',
    'learning_rate' : 0.008,
    'num_leaves' : 120,
    'feature_fraction': 0.63, 
    'bagging_fraction': 0.8, 
    'bagging_freq':1,
    'boosting_type' : 'dart',
    'metric': {'binary_logloss'},
    'max_depth' : 20
}

In [23]:
val_f1, val_acc, val_loss = [], [], []
results = np.zeros((kfold.n_splits, num_test), dtype=np.float)
prob_results = np.zeros(len(trainset_x), dtype=np.float)

for i, (train_idx, valid_idx) in enumerate(kfold.split(trainset_x, trainset_y)):

    # 데이터셋 나눔(train, valid & x,y)
    x_train, y_train = trainset_x[train_idx], trainset_y[train_idx]
    x_valid, y_valid = trainset_x[valid_idx], trainset_y[valid_idx]
    x_test           = df_test[cols_input].values

    # 트레인셋 기준으로 평균값, 표준편차 계산
    ## 수치형 변수 기준
    ### baseline : cols_input2(수치형)
    cktpt = len(cols_input2)
    tr_mean, tr_std = x_train[:, -cktpt:].mean(axis=0), x_train[:, -cktpt:].std(axis=0)

    # train/valid 
    x_train[:,-cktpt:] = (x_train[:,-cktpt:] - tr_mean) / (tr_std + 1e-5)
    x_valid[:,-cktpt:] = (x_valid[:,-cktpt:] - tr_mean) / (tr_std + 1e-5)
    x_test[:,-cktpt:]  = (x_test[:,-cktpt:]  - tr_mean) / (tr_std + 1e-5)

    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)

    model = lgbm.train(params, d_train, 30000, valid_sets=[d_valid, d_train], valid_names=['val', 'train'],
                       verbose_eval=1000, early_stopping_rounds=100)

    valid_pred_prob = model.predict(x_valid, num_iteration=model.best_iteration)
    prob_results[valid_idx] = valid_pred_prob
    y_valid_pred = (valid_pred_prob > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)
    loss = model.best_score['val']['binary_logloss']

    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    results[i] = model.predict(x_test, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)
    val_loss.append(loss)

print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')




df_result = df_tot[['type','SMILES','label']].copy()
df_result.loc[:, '1'] = pd.Series(np.hstack([prob_results, results.mean(axis=0)]))
df_result.loc[:, 'predict'] = (df_result['1'] > 0.5).astype(int)
df_result = df_result[['type', 'SMILES', '1', 'label', 'predict']]

OUTPUT_PATH = '/Users/skcc10170/Desktop/data/model_result/'
output_name = 'model_acc_' + "{:.4f}".format(np.mean(val_acc)) \
                           + '_f1_' + "{:.4f}".format(np.mean(val_f1)) \
                           + '_loss_' + "{:.4f}".format(np.mean(val_loss))

df_result.to_csv(OUTPUT_PATH + output_name + '.csv')
# parameterset
pd.DataFrame(params).to_json(OUTPUT_PATH + output_name + '.json')
print(output_name)

Training until validation scores don't improve for 100 rounds.
[1000]	train's binary_logloss: 0.326967	val's binary_logloss: 0.453842
[2000]	train's binary_logloss: 0.236678	val's binary_logloss: 0.422757
Early stopping, best iteration is:
[2705]	train's binary_logloss: 0.192705	val's binary_logloss: 0.413111
Fold 0 | Valid Accuracy: 0.8162776780371035, F1 Score: 0.834679590737749
Training until validation scores don't improve for 100 rounds.
[1000]	train's binary_logloss: 0.320799	val's binary_logloss: 0.469229
[2000]	train's binary_logloss: 0.232802	val's binary_logloss: 0.440222
[3000]	train's binary_logloss: 0.177855	val's binary_logloss: 0.429612
Early stopping, best iteration is:
[3730]	train's binary_logloss: 0.148	val's binary_logloss: 0.426825
Fold 1 | Valid Accuracy: 0.7971274685816876, F1 Score: 0.8132231404958677
Training until validation scores don't improve for 100 rounds.
[1000]	train's binary_logloss: 0.323036	val's binary_logloss: 0.468626
[2000]	train's binary_logloss