# baseline 2

    설명
    
    1. 데이터셋
    - [O] origin
    
    2. 전처리
    - [O] standardization
    
    3. lgbm
    - [X] binary classification loss
    - [O] f1 loss

### import library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from rdkit import Chem
%matplotlib inline

import lightgbm as lgbm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, f1_score, accuracy_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### load data

In [2]:
# Root directory that holds the `data/org` folder with the CSV inputs.
CURRENT_PATH = '/Users/skcc10170/Desktop'

# Train / validation / prediction-input files, in that order.
_csv_paths = (
    CURRENT_PATH + '/data/org/train_.csv',
    CURRENT_PATH + '/data/org/valid_.csv',
    CURRENT_PATH + '/data/org/predict_input.csv',
)
df_train, df_valid, df_test = (pd.read_csv(csv_path) for csv_path in _csv_paths)

# All three frames stacked row-wise with a fresh 0..n-1 index; sort=True
# orders the union of columns alphabetically.
df_tot = pd.concat([df_train, df_valid, df_test], sort=True).reset_index(drop=True)

### data feature

In [3]:
# Every column present in the training frame.
cols = df_train.columns


def _cols_containing(tag):
    """Return the training-frame column names that contain `tag`, in file order."""
    return cols[cols.str.contains(tag)].tolist()


# SMILES string column
cols_smiles = 'SMILES'

# Node/edge-level fingerprint columns (three fingerprint families).
# The original notes say each family has 1024 columns -- not checked here.
cols_ecfp = _cols_containing('ecfp_')
cols_fcfp = _cols_containing('fcfp_')
cols_ptfp = _cols_containing('ptfp_')

# Graph-level (whole-molecule) numeric descriptors.
cols_mol = ['MolWt', 'clogp', 'sa_score', 'qed']

# Model inputs: fingerprints first, numeric descriptors last.
# The CV loop relies on the numeric descriptors being the trailing columns.
cols_input1 = [*cols_ecfp, *cols_fcfp, *cols_ptfp]
cols_input2 = cols_mol
cols_input = cols_input1 + cols_input2

# Target column
cols_label = 'label'

### CV start!

In [14]:
# Cross-validation configuration.
n_splits, random_state = 5, 2020

# Number of rows to predict (927 according to the original note).
num_test = df_test.shape[0]

# Shuffled, label-stratified folds with a fixed seed for reproducibility.
kfold = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=random_state,
)

In [15]:
def _features_and_labels(frame):
    """Return (feature matrix, label vector) as NumPy arrays for a labelled frame."""
    return frame[cols_input].values, frame[cols_label].values


x_train, y_train = _features_and_labels(df_train)
x_valid, y_valid = _features_and_labels(df_valid)
x_test = df_test[cols_input].values  # prediction input has no label column used here

# Pool the provided train and valid splits; the CV loop re-splits this pool.
trainset_x = np.concatenate((x_train, x_valid), axis=0)
trainset_y = np.concatenate((y_train, y_valid))

In [19]:
# LightGBM training parameters: binary objective with DART boosting,
# log-loss as the built-in metric (F1 is supplied separately as a custom feval).
params = dict(
    objective='binary',
    learning_rate=0.03,
    num_leaves=60,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    boosting_type='dart',
    metric='binary_logloss',
    max_depth=32,
)

In [20]:
def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation metric reporting the F1 score.

    Args:
        y_hat: Predicted scores for the evaluated dataset; they are rounded
            to the nearest integer before scoring.
        data: Dataset object exposing ``get_label()`` for the true labels.

    Returns:
        A ``(metric_name, value, is_higher_better)`` tuple:
        ``('f1', <f1 score>, True)``.
    """
    # f1_score needs hard class labels, not probabilities.
    hard_labels = np.round(y_hat)
    return 'f1', f1_score(data.get_label(), hard_labels), True

In [21]:
# Stratified K-fold training loop.
#
# For every fold: split the pooled training data, standardize the trailing
# numeric descriptor columns using statistics of the fold's training part only,
# train LightGBM, score the held-out part, and store test-set probabilities.
#
# Outputs:
#   val_f1, val_acc : per-fold validation F1 / accuracy
#   results         : (n_folds, num_test) matrix of predicted test probabilities
val_f1, val_acc = [], []
# `np.float` was a deprecated alias removed in NumPy 1.24; the builtin `float`
# yields the same float64 dtype.
results = np.zeros((kfold.n_splits, num_test), dtype=float)

# Number of trailing numeric columns (cols_input2) to standardize; the
# fingerprint columns in front of them are left as they are.
cktpt = len(cols_input2)

for i, (train_idx, valid_idx) in enumerate(kfold.split(trainset_x, trainset_y)):

    # Split the pool into this fold's train/valid parts (x and y).
    # Integer-array indexing returns copies, so the in-place scaling below
    # never modifies trainset_x.
    x_train, y_train = trainset_x[train_idx], trainset_y[train_idx]
    x_valid, y_valid = trainset_x[valid_idx], trainset_y[valid_idx]

    # Work on a per-fold copy of the test features. Scaling `x_test` itself in
    # place would apply every fold's transform on top of the previous ones,
    # so folds after the first (and any re-run of this cell) would predict on
    # wrongly scaled inputs.
    x_test_fold = x_test.copy()

    # Mean / standard deviation of the numeric columns, from the training part only.
    tr_mean = x_train[:, -cktpt:].mean(axis=0)
    tr_std = x_train[:, -cktpt:].std(axis=0)

    # Apply the same training-derived standardization to train, valid and test.
    x_train[:, -cktpt:] = (x_train[:, -cktpt:] - tr_mean) / tr_std
    x_valid[:, -cktpt:] = (x_valid[:, -cktpt:] - tr_mean) / tr_std
    x_test_fold[:, -cktpt:] = (x_test_fold[:, -cktpt:] - tr_mean) / tr_std

    d_train = lgbm.Dataset(x_train, y_train)
    d_valid = lgbm.Dataset(x_valid, y_valid)

    evals_result = {}
    model = lgbm.train(params, d_train, 30000, valid_sets=[d_valid, d_train], valid_names=['val', 'train'],
                       feval=lgb_f1_score, evals_result=evals_result, verbose_eval=1000, early_stopping_rounds=50)
    # Optional diagnostic: lgbm.plot_metric(evals_result, metric='f1')

    # Hard labels on the held-out part using a 0.5 probability threshold.
    y_valid_pred = (model.predict(x_valid, num_iteration=model.best_iteration) > 0.5).astype(int)

    f1 = f1_score(y_valid, y_valid_pred)
    acc = accuracy_score(y_valid, y_valid_pred)

    print(f'Fold {i} | Valid Accuracy: {acc}, F1 Score: {f1}')

    # Store this fold's test-set probabilities (one row per fold).
    results[i] = model.predict(x_test_fold, num_iteration=model.best_iteration)
    val_f1.append(f1)
    val_acc.append(acc)

print(f'Valid Accuracy: {np.mean(val_acc)}, F1 Score: {np.mean(val_f1)}')

Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[94]	train's binary_logloss: 0.472182	train's f1: 0.873727	val's binary_logloss: 0.522799	val's f1: 0.812134
Fold 0 | Valid Accuracy: 0.7881508078994613, F1 Score: 0.8113006396588486
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[15]	train's binary_logloss: 0.592941	train's f1: 0.829071	val's binary_logloss: 0.618189	val's f1: 0.781676
Fold 1 | Valid Accuracy: 0.7259126271693597, F1 Score: 0.7746062992125984
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[125]	train's binary_logloss: 0.451656	train's f1: 0.88	val's binary_logloss: 0.526438	val's f1: 0.788523
Fold 2 | Valid Accuracy: 0.7573397243858598, F1 Score: 0.7837693539775761
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[205]	train's binary_logloss: 0.405782	train's f1: 0.896422	val's 