In [23]:
import glob
import os
import pickle
import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

from mymodule import pre_train_test, base_feature, spearman_and_mse,\
                     load_model, ensemble, pre_submit

In [29]:
def preprocess(df,
               train=True,
               drops=['protein_sequence'],
               save=False,
               save_name=None):
    """Build features for ``df`` and split it into ``(idx, x, y)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table; it is passed to ``base_feature`` from ``mymodule``.
    train : bool, default True
        Forwarded unchanged to ``pre_train_test``.
    drops : list of str, default ['protein_sequence']
        Forwarded unchanged to ``pre_train_test`` (presumably the columns to
        exclude from the feature matrix -- confirm in ``mymodule``).
    save : bool, default False
        When True, the featured frame is written to
        ``'dataset/featured/' + save_name`` as CSV without the index.
    save_name : str or None
        File name of the CSV; required when ``save`` is True.

    Returns
    -------
    tuple
        ``(idx, x, y)`` exactly as returned by ``pre_train_test``.

    Raises
    ------
    NameError
        If ``save`` is True and ``save_name`` is None.
    """
    # Validate the save arguments up front so a missing file name is reported
    # before the featurisation below is run rather than after it.
    if save and save_name is None:
        raise NameError('(arg: save_name) is None')

    # Feature engineering implemented in mymodule.py.
    df = base_feature(df)
    # Split the featured frame into identifier / features / target.
    idx, x, y = pre_train_test(df, train=train, drops=drops)

    # Persist the featured frame (not the split pieces) for later reuse.
    if save:
        df.to_csv('dataset/featured/' + save_name, index=False)

    return idx, x, y
# save_name = 'base.csv'
# idx, x, y = preprocess(df,save_name=save_name, save=True)

In [4]:
# Reload the featured table from disk instead of recomputing the features
# (presumably the file written by preprocess(..., save_name='base.csv',
# save=True) -- confirm), then split it into identifier / features / target.
drops = ['protein_sequence']
base_df = pd.read_csv('dataset/featured/base.csv')
idx_train, x_train, y_train = pre_train_test(
    base_df,
    drops=drops,
    train=True,
)

In [5]:
def training(x, y ,n_splits , params ,verbose ,save=True , file_path=None):
    """Train one LightGBM regressor per K-fold split and collect out-of-fold results.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix. Rows are selected by position for every fold.
    y : pandas.Series
        Target values, row-aligned with ``x``.
    n_splits : int
        Number of folds for ``KFold`` (shuffled, ``random_state=123``).
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMRegressor``.
    verbose : bool or int
        Forwarded to ``LGBMRegressor.fit`` as ``verbose``.
    save : bool, default True
        When True, each fold's fitted model is pickled to
        ``file_path + '_fold<k>'``.
    file_path : str or None
        Path prefix for the pickled models; required when ``save`` is True.

    Returns
    -------
    dict
        ``{'val': Series, 'pred': Series}``; both have length ``len(y)`` and a
        default integer index. Entry ``i`` holds the true target and the
        out-of-fold prediction for row position ``i`` of ``x`` / ``y``.

    Raises
    ------
    ValueError
        If ``save`` is True and ``file_path`` is None.
    """
    # Validate and prepare the output location before any training starts, so
    # a bad path does not surface only after the first fold has been fitted.
    if save:
        if file_path is None:
            raise ValueError('(arg: file_path) is None but save=True')
        out_dir = os.path.dirname(file_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Out-of-fold containers, filled by row position as each fold finishes.
    oof_true = pd.Series([None]*len(y))
    oof_pred = pd.Series([None]*len(y))

    # KFold
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=123)
    for fold, (train_idx, val_idx) in enumerate(cv.split(x, y)):
        print('-'*20, f' fold_{fold} ', '-'*20)

        # KFold.split yields positional indices, so slice the function's own
        # arguments with .iloc (not notebook globals, and not label-based .loc).
        x_tr, y_tr = x.iloc[train_idx], y.iloc[train_idx]
        x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

        # train model
        # NOTE(review): newer LightGBM releases (4.x) are documented to have
        # dropped the `early_stopping_rounds` / `verbose` fit keywords in
        # favour of callbacks -- confirm against the installed version
        # before upgrading.
        model = lgb.LGBMRegressor(**params)
        model.fit(
            x_tr, y_tr,
            eval_set=[(x_tr, y_tr),
                      (x_val, y_val)],
            early_stopping_rounds=50,
            verbose=verbose)

        # evaluate prediction with the Spearman correlation coefficient
        # NOTE(review): logged runs show correlation ~0.99999 next to a
        # validation l1 of ~6, which looks inconsistent -- verify
        # spearman_and_mse in mymodule.
        y_pred = model.predict(x_val)
        corr, mse = spearman_and_mse(y_val, y_pred)
        print('correlation: {:.5}'.format(corr,),' | mse: {:.1}'.format(mse))

        # Store by position; to_numpy() prevents index alignment on assignment.
        oof_true.iloc[val_idx] = y_val.to_numpy()
        oof_pred.iloc[val_idx] = y_pred

        # save model to file_path (path is only built when it is needed)
        if save:
            path = file_path + f'_fold{fold}'
            with open(path, 'wb') as f:
                pickle.dump(model, f)

        print()

    return {'val': oof_true, 'pred': oof_pred}

In [6]:
# Inputs for this run: the feature matrix and target from the split above.
x, y = x_train, y_train

# LightGBM hyper-parameters: L1 objective, evaluated with the matching
# mean-absolute-error metric. n_estimators is only an upper bound because
# training() fits with early stopping.
params = dict(
    boosting_type='gbdt',
    objective='regression_l1',
    metric='mean_absolute_error',
    learning_rate=0.05,
    num_leaves=32,
    subsample=0.7,
    subsample_freq=1,
    feature_fraction=0.8,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=50,
    n_estimators=10000,
    random_state=123,
    importance_type='gain',
)

# Cross-validation and logging settings.
n_splits = 5
verbose = 1000

# Per-fold models are pickled to models/base_test_fold<k>.
file_name = 'base_test'
file_path = os.path.join('models', file_name)

result = training(
    x, y,
    n_splits=n_splits,
    params=params,
    verbose=verbose,
    save=True,
    file_path=file_path,
)

--------------------  fold_0  --------------------
Training until validation scores don't improve for 50 rounds
[1000]	training's l1: 5.30831	valid_1's l1: 6.32239
[2000]	training's l1: 4.8933	valid_1's l1: 6.21642
[3000]	training's l1: 4.66114	valid_1's l1: 6.16864
[4000]	training's l1: 4.4927	valid_1's l1: 6.13405
Early stopping, best iteration is:
[4003]	training's l1: 4.49244	valid_1's l1: 6.13401
correlation: 0.99999  | mse: 8e+01

--------------------  fold_1  --------------------
Training until validation scores don't improve for 50 rounds
[1000]	training's l1: 5.24272	valid_1's l1: 6.46682
[2000]	training's l1: 4.82009	valid_1's l1: 6.37013
[3000]	training's l1: 4.5682	valid_1's l1: 6.32661
Early stopping, best iteration is:
[3380]	training's l1: 4.50214	valid_1's l1: 6.31851
correlation: 0.99999  | mse: 9e+01

--------------------  fold_2  --------------------
Training until validation scores don't improve for 50 rounds
[1000]	training's l1: 5.26173	valid_1's l1: 6.33258
[2000