In [28]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from xgboost import XGBRegressor
from imblearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.metrics import mean_squared_error




In [4]:
warnings.filterwarnings('ignore')  # 경고메세지 무시

# 1. Get data

In [5]:
# data directory
data_dir = './DATA/optiver-realized-volatility-prediction/'

In [6]:
# train, test 데이터 불러오는 함수

def read_data():
    train = pd.read_csv(data_dir + 'train.csv')
    
    # book, trade 데이터를 merge할 때 활요할 key 생성
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    
    return train

In [7]:
data = read_data()

data

Unnamed: 0,stock_id,time_id,target,row_id
0,0,5,0.004136,0-5
1,0,11,0.001445,0-11
2,0,16,0.002168,0-16
3,0,31,0.002195,0-31
4,0,62,0.001747,0-62
...,...,...,...,...
428927,126,32751,0.003461,126-32751
428928,126,32753,0.003113,126-32753
428929,126,32758,0.004070,126-32758
428930,126,32763,0.003357,126-32763


In [8]:
# book, trade 데이터 불러오는 함수

def read_book_trade(stock_id):

    book = pd.read_parquet(data_dir + 'book_train.parquet/stock_id=' + str(stock_id))
    trade = pd.read_parquet(data_dir + 'trade_train.parquet/stock_id=' + str(stock_id))
        
    return book, trade

# 2. Feature engineering

In [9]:
# WAP1 구하는 함수
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

# WAP2 구하는 함수
def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

# 로그 수익률 구하는 함수
def log_return(series):
    return np.log(series).diff()

# realized volatility 구하는 함수
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

def count_unique(series):
    return len(np.unique(series))

In [10]:
# window(10분 동안의 데이터 중 특정 seconds 이후의 데이터)별로 df 추출하는 함수
def get_stats_window(df, create_feature_dict, seconds_in_bucket, add_suffix = False):  # add_suffix: column명 뒤에 문자열 더 추가
    
    # aggregation 적용
    df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
    # 1차원으로 columns명 변경
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]
    # add_suffix 적용
    if add_suffix:
        df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        
    return df_feature

In [11]:
# book 데이터에서 feature 추출하는 함수
def book_preprocessor(stock_id):
    
    df = read_book_trade(stock_id)[0]

    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)

    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)

    df['wap_balance'] = abs(df['wap1'] - df['wap2'])

    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    # aggregations dict
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std]
    }
    
    # get_stats_window함수 이용하여 각 window별로 df 추출
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    df_feature_450 = get_stats_window(df, create_feature_dict, seconds_in_bucket = 450, add_suffix = True)
    df_feature_300 = get_stats_window(df, create_feature_dict, seconds_in_bucket = 300, add_suffix = True)
    df_feature_150 = get_stats_window(df, create_feature_dict, seconds_in_bucket = 150, add_suffix = True)
    
    # 각 window df들 합치기
    df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    
    # 불필요한 time_ids drop
    df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    # Create row_id to merge
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    
    return df_feature

In [12]:
# trade 데이터에서 feature 추출하는 함수
def trade_preprocessor(stock_id):
    
    df = read_book_trade(stock_id)[1]
    
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    # aggregations dict
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }
    
    # get_stats_window함수 이용하여 각 window별로 df 추출
    df_feature = get_stats_window(df, create_feature_dict, seconds_in_bucket = 0, add_suffix = False)
    df_feature_450 = get_stats_window(df, create_feature_dict, seconds_in_bucket = 450, add_suffix = True)
    df_feature_300 = get_stats_window(df, create_feature_dict, seconds_in_bucket = 300, add_suffix = True)
    df_feature_150 = get_stats_window(df, create_feature_dict, seconds_in_bucket = 150, add_suffix = True)
    
    # 각 window df들 합치기
    df_feature = df_feature.merge(df_feature_450, how = 'left', left_on = 'time_id_', right_on = 'time_id__450')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_150, how = 'left', left_on = 'time_id_', right_on = 'time_id__150')
    
    # 불필요한 time_ids drop
    df_feature.drop(['time_id__450', 'time_id__300', 'time_id__150'], axis = 1, inplace = True)
    
    # columns앞에 문자열 추가
    df_feature = df_feature.add_prefix('trade_')
    
    # Create row_id to merge
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
        
    return df_feature

In [13]:
# book, trade의 모든 stock_id에 대해 book_proprecessor, trade_preprocessor를 적용하여 merge하는 함수
def preprocessor():
    
    # book preprocessor와 trade preprocessor를 merge
    def merge_preprocessors(stock_id):
        
        df_tmp = pd.merge(book_preprocessor(stock_id), trade_preprocessor(stock_id), on = 'row_id', how = 'left')
        
        return df_tmp
    
    # stock_id 모음 series
    stock_ids = pd.read_csv(data_dir + 'train.csv')['stock_id'].unique()
    
    # Parallel를 이용하여 모든 stock_id에 대해 for문
    df = Parallel(n_jobs = -1, verbose = 1)(delayed(merge_preprocessors)(stock_id) for stock_id in stock_ids)
    df = pd.concat(df, ignore_index = True)
    
    return df

In [14]:
data_ = preprocessor()

data_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  2.4min finished


Unnamed: 0,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,log_return1_sum,log_return1_realized_volatility,log_return1_mean,log_return1_std,...,trade_size_sum_450,trade_order_count_mean_450,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,trade_log_return_realized_volatility_150,trade_seconds_in_bucket_count_unique_150,trade_size_sum_150,trade_order_count_mean_150
0,303.125061,1.003725,0.000693,303.105539,1.003661,0.000781,0.002292,0.004499,7.613599e-06,0.000260,...,1042.0,2.642857,0.001308,21.0,1587.0,2.571429,0.001701,30.0,2069.0,2.433333
1,200.047768,1.000239,0.000262,200.041171,1.000206,0.000272,0.000360,0.001204,1.810239e-06,0.000086,...,828.0,2.200000,0.000587,16.0,900.0,2.250000,0.000813,24.0,1173.0,2.041667
2,187.913849,0.999542,0.000864,187.939824,0.999680,0.000862,-0.002074,0.002369,-1.109201e-05,0.000173,...,1085.0,3.666667,0.001137,12.0,1189.0,3.166667,0.001621,20.0,2010.0,2.950000
3,119.859781,0.998832,0.000757,119.835941,0.998633,0.000656,-0.002828,0.002574,-2.376661e-05,0.000236,...,514.0,3.666667,0.001089,9.0,1556.0,5.111111,0.001401,11.0,1631.0,4.545455
4,175.932865,0.999619,0.000258,175.934256,0.999626,0.000317,-0.000002,0.001894,-1.057099e-08,0.000144,...,43.0,3.500000,0.000453,11.0,1219.0,4.909091,0.000550,16.0,1570.0,4.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,309.870453,0.999582,0.000486,309.871368,0.999585,0.000613,-0.000527,0.003691,-1.706958e-06,0.000210,...,491.0,1.833333,0.001451,18.0,796.0,2.055556,0.001924,27.0,1426.0,2.259259
428928,223.552139,1.002476,0.001264,223.580322,1.002602,0.001303,0.004436,0.004104,1.998065e-05,0.000275,...,326.0,1.769231,0.001791,20.0,1107.0,3.550000,0.002101,31.0,1550.0,3.161290
428929,256.277039,1.001082,0.000466,256.255066,1.000996,0.000599,0.001525,0.003117,5.979073e-06,0.000196,...,348.0,2.166667,0.001580,24.0,2750.0,2.541667,0.001913,31.0,3493.0,2.838710
428930,399.721741,1.001809,0.000456,399.714325,1.001790,0.000507,0.000256,0.003661,6.429271e-07,0.000184,...,2300.0,2.727273,0.001520,43.0,5150.0,2.813953,0.001714,62.0,7261.0,2.822581


In [15]:
# target 데이터가 있는 train에 preprocessor를 통해 얻은 train_ 부착

data = data.merge(data_, on = ['row_id'], how = 'left')

data

Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,...,trade_size_sum_450,trade_order_count_mean_450,trade_log_return_realized_volatility_300,trade_seconds_in_bucket_count_unique_300,trade_size_sum_300,trade_order_count_mean_300,trade_log_return_realized_volatility_150,trade_seconds_in_bucket_count_unique_150,trade_size_sum_150,trade_order_count_mean_150
0,0,5,0.004136,0-5,303.125061,1.003725,0.000693,303.105539,1.003661,0.000781,...,1042.0,2.642857,0.001308,21.0,1587.0,2.571429,0.001701,30.0,2069.0,2.433333
1,0,11,0.001445,0-11,200.047768,1.000239,0.000262,200.041171,1.000206,0.000272,...,828.0,2.200000,0.000587,16.0,900.0,2.250000,0.000813,24.0,1173.0,2.041667
2,0,16,0.002168,0-16,187.913849,0.999542,0.000864,187.939824,0.999680,0.000862,...,1085.0,3.666667,0.001137,12.0,1189.0,3.166667,0.001621,20.0,2010.0,2.950000
3,0,31,0.002195,0-31,119.859781,0.998832,0.000757,119.835941,0.998633,0.000656,...,514.0,3.666667,0.001089,9.0,1556.0,5.111111,0.001401,11.0,1631.0,4.545455
4,0,62,0.001747,0-62,175.932865,0.999619,0.000258,175.934256,0.999626,0.000317,...,43.0,3.500000,0.000453,11.0,1219.0,4.909091,0.000550,16.0,1570.0,4.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,126-32751,309.870453,0.999582,0.000486,309.871368,0.999585,0.000613,...,491.0,1.833333,0.001451,18.0,796.0,2.055556,0.001924,27.0,1426.0,2.259259
428928,126,32753,0.003113,126-32753,223.552139,1.002476,0.001264,223.580322,1.002602,0.001303,...,326.0,1.769231,0.001791,20.0,1107.0,3.550000,0.002101,31.0,1550.0,3.161290
428929,126,32758,0.004070,126-32758,256.277039,1.001082,0.000466,256.255066,1.000996,0.000599,...,348.0,2.166667,0.001580,24.0,2750.0,2.541667,0.001913,31.0,3493.0,2.838710
428930,126,32763,0.003357,126-32763,399.721741,1.001809,0.000456,399.714325,1.001790,0.000507,...,2300.0,2.727273,0.001520,43.0,5150.0,2.813953,0.001714,62.0,7261.0,2.822581


In [16]:
# realized volatility를 stock과 time별로 groupby했을 때의 정보를 추가하는 함수
def get_time_stock(df):  # df에는 train과 preprocessor가 merge된 df가 들어가야 함
    
    # Get realized volatility columns
    vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'log_return1_realized_volatility_450', 'log_return2_realized_volatility_450', 
                'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300', 'log_return1_realized_volatility_150', 'log_return2_realized_volatility_150', 
                'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_450', 'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_150']

    # Group by stock id
    df_stock_id = df.groupby(['stock_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
        # Rename columns using add_suffix
    df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
    df_stock_id = df_stock_id.add_suffix('_' + 'stock')

    # Group by time id
    df_time_id = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    # Rename columns joining suffix
    df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix('_' + 'time')
    
    # Merge with original dataframe
    df = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    df = df.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    df.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    
    return df

In [17]:
data = get_time_stock(data)

In [18]:
# data의 전체 feature

for col in data.columns:
    print(col)

stock_id
time_id
target
row_id
wap1_sum
wap1_mean
wap1_std
wap2_sum
wap2_mean
wap2_std
log_return1_sum
log_return1_realized_volatility
log_return1_mean
log_return1_std
log_return2_sum
log_return2_realized_volatility
log_return2_mean
log_return2_std
wap_balance_sum
wap_balance_mean
wap_balance_std
price_spread_sum
price_spread_mean
price_spread_std
bid_spread_sum
bid_spread_mean
bid_spread_std
ask_spread_sum
ask_spread_mean
ask_spread_std
total_volume_sum
total_volume_mean
total_volume_std
volume_imbalance_sum
volume_imbalance_mean
volume_imbalance_std
wap1_sum_450
wap1_mean_450
wap1_std_450
wap2_sum_450
wap2_mean_450
wap2_std_450
log_return1_sum_450
log_return1_realized_volatility_450
log_return1_mean_450
log_return1_std_450
log_return2_sum_450
log_return2_realized_volatility_450
log_return2_mean_450
log_return2_std_450
wap_balance_sum_450
wap_balance_mean_450
wap_balance_std_450
price_spread_sum_450
price_spread_mean_450
price_spread_std_450
bid_spread_sum_450
bid_spread_mean_450
bid_

# 3. XG Boost

## (1) No hyperparameter tuning
-> 잘 나옴! (RMSPE: 0.259)

In [19]:
# train 및 test를 수행하는 함수
def train_and_test(df):
  
    model = XGBRegressor()
    
    # 범주형 변수인 stock_id에 원핫 인코딩 적용
    dummies = pd.get_dummies(df['stock_id'])
    df = pd.concat([dummies, df], axis=1)
    
    # X, y 데이터로 분류
    X = data.drop(['stock_id', 'row_id', 'target', 'time_id'], axis = 1)
    y = data['target']
    
    # train, vlaidation, test 데이터로 분류
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) # stratify: 비율 유지하면서 split
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/8, random_state=456)
    
    # model 설정 & fitting
    final_model = XGBRegressor(n_estimators = 100,
                               learning_rate = 0.1,
                               colsample_bytree = 0.8,
                               subsample = 0.8      
                                )
    
    final_model.fit( X_train, y_train, eval_metric = 'rmse', eval_set =[(X_train, y_train), (X_val, y_val)],early_stopping_rounds = 30 )
    
    # prdict
    y_pred = final_model.predict(X_test)
    
    # RMSPE 계산 (y_test 값으로 scailing 해줌으로서 좀더 직관적인 평가지표 제공)
    rmspe = np.sqrt(np.mean(np.square(((y_pred - y_test) / y_test)), axis=0))
    
    return y_pred, y_test, rmspe

In [20]:
y_pred, y_test, rmspe = train_and_test(data)

[0]	validation_0-rmse:0.44652	validation_1-rmse:0.44652
[1]	validation_0-rmse:0.40187	validation_1-rmse:0.40187
[2]	validation_0-rmse:0.36168	validation_1-rmse:0.36168
[3]	validation_0-rmse:0.32551	validation_1-rmse:0.32552
[4]	validation_0-rmse:0.29296	validation_1-rmse:0.29297
[5]	validation_0-rmse:0.26367	validation_1-rmse:0.26367
[6]	validation_0-rmse:0.23730	validation_1-rmse:0.23731
[7]	validation_0-rmse:0.21357	validation_1-rmse:0.21358
[8]	validation_0-rmse:0.19222	validation_1-rmse:0.19222
[9]	validation_0-rmse:0.17300	validation_1-rmse:0.17300
[10]	validation_0-rmse:0.15570	validation_1-rmse:0.15570
[11]	validation_0-rmse:0.14013	validation_1-rmse:0.14014
[12]	validation_0-rmse:0.12612	validation_1-rmse:0.12612
[13]	validation_0-rmse:0.11351	validation_1-rmse:0.11351
[14]	validation_0-rmse:0.10216	validation_1-rmse:0.10216
[15]	validation_0-rmse:0.09195	validation_1-rmse:0.09195
[16]	validation_0-rmse:0.08275	validation_1-rmse:0.08276
[17]	validation_0-rmse:0.07448	validation

In [21]:
y_pred

array([0.00370526, 0.00680944, 0.00114997, ..., 0.00145014, 0.00314139,
       0.00437999], dtype=float32)

In [22]:
y_test

80707     0.002899
372527    0.007649
403214    0.001238
346182    0.008873
163964    0.001458
            ...   
274836    0.006823
268491    0.017986
78583     0.001671
149725    0.003329
345033    0.005183
Name: target, Length: 85787, dtype: float64

In [23]:
rmspe

0.26217641080364146

## (2) With yperparameter tuning
-> 아직 안 돌려봄. parameter 어떤 걸, 어떻게 tuning할지 더 고민해봐야 함

In [24]:
data

Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,...,trade_log_return_realized_volatility_450_max_time,trade_log_return_realized_volatility_450_min_time,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time
0,0,5,0.004136,0-5,303.125061,1.003725,0.000693,303.105539,1.003661,0.000781,...,0.003242,0.000543,0.001820,0.000692,0.004595,0.000710,0.002286,0.000836,0.005362,0.000888
1,0,11,0.001445,0-11,200.047768,1.000239,0.000262,200.041171,1.000206,0.000272,...,0.002701,0.000000,0.000906,0.000460,0.002783,0.000000,0.001140,0.000583,0.002851,0.000000
2,0,16,0.002168,0-16,187.913849,0.999542,0.000864,187.939824,0.999680,0.000862,...,0.002751,0.000114,0.001100,0.000428,0.003082,0.000497,0.001347,0.000484,0.003414,0.000717
3,0,31,0.002195,0-31,119.859781,0.998832,0.000757,119.835941,0.998633,0.000656,...,0.003404,0.000000,0.001052,0.000600,0.004218,0.000000,0.001349,0.000698,0.004974,0.000269
4,0,62,0.001747,0-62,175.932865,0.999619,0.000258,175.934256,0.999626,0.000317,...,0.001936,0.000158,0.000812,0.000372,0.002470,0.000278,0.001036,0.000466,0.003281,0.000317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,126-32751,309.870453,0.999582,0.000486,309.871368,0.999585,0.000613,...,0.003079,0.000000,0.001261,0.000485,0.004049,0.000534,0.001576,0.000651,0.004783,0.000647
428928,126,32753,0.003113,126-32753,223.552139,1.002476,0.001264,223.580322,1.002602,0.001303,...,0.003531,0.000000,0.001008,0.000687,0.006310,0.000329,0.001241,0.000814,0.007915,0.000404
428929,126,32758,0.004070,126-32758,256.277039,1.001082,0.000466,256.255066,1.000996,0.000599,...,0.001669,0.000000,0.001055,0.000376,0.001995,0.000000,0.001306,0.000422,0.002566,0.000000
428930,126,32763,0.003357,126-32763,399.721741,1.001809,0.000456,399.714325,1.001790,0.000507,...,0.003270,0.000400,0.001474,0.000591,0.005284,0.000686,0.001839,0.000731,0.006914,0.001004


In [25]:
# train 및 test를 수행하는 함수
def train_and_test(df):
  
    model = XGBRegressor()
    
    # 범주형 변수인 stock_id에 원핫 인코딩 적용
    dummies = pd.get_dummies(df['stock_id'])
    df = pd.concat([dummies, df], axis=1)
    
    # X, y 데이터로 분류
    X = data.drop(['stock_id', 'row_id', 'target', 'time_id'], axis = 1)
    y = data['target']
    
    # train, vlaidation, test 데이터로 분류
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) # stratify: 비율 유지하면서 split
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/8, random_state=456)
    
    # Function for tuning hyperparameter
    def tuning_hyperparameter():
        params = {'XGBRegressor__n_estimators' :[100, 200, 300, 400, 500],
          'XGBRegressor__learning_rate': [0.05, 0.1, 0.3, 0.5, 1],
          'XGBRegressor__min_child_weight':[0, 0.1, 0.5],
          'XGBRegressor__gamma' : [0, 0.1, 0.5, 1],                   # 관측치에 대한 가중치 합의 최소 (높을수록 overfitting 방지)
          'XGBRegressor__colsample_bytree': [0.6, 0.7, 0.8],          # 각 iteration에 사용되는 feature의 비율
          'XGBRegressor__subsample': [0.6, 0.7, 0.8]                  # 각 iteration에 사용되는 data의 비율
        }
        
        grid_xgb = RandomizedSearchCV( model,
                              param_distributions = params,
                              n_iter = 25,
                              cv = 5,
                              scoring = 'neg_mean_squared_error',
                              verbose = -1
                              )
        grid_xgb.fit( X_train, y_train )
        
        print('Best parameters: ', grid_xgb.best_params_)
        print('Best score: ', grid_xgb.best_score_)
        
        return grid_xgb.best_params_
    
    best_parameter = tuning_hyperparameter()
    best_n_estimators = best_parameter['XGBRegressor__n_estimators']
    best_learning_rate = best_parameter['XGBRegressor__learning_rate']
    best_min_child_weight = best_parameter['XGBRegressor__min_child_weight']
    best_gamma = best_parameter['XGBRegressor__gamma']
    best_colsample_bytree = best_parameter['XGBRegressor__colsample_bytree']
    best_subsample = best_parameter['XGBRegressor__subsample']
    
    final_model = XGBRegressor(n_estimators = best_n_estimators,
                               learning_rate = best_learning_rate,
                               min_child_weight = best_min_child_weight,
                               gamma = best_gamma,
                               colsample_bytree = best_colsample_bytree,
                               subsample = best_subsample      
    )
    
    final_model.fit( X_train, y_train, eval_metric = 'rmse', eval_set =[(X_train, y_train), (X_val, y_val)],early_stopping_rounds = 30 )
    
    # Get predictions
    y_pred = final_model.predict(X_test)
    
    # Calculate RMSPE
    rmspe = np.sqrt(np.mean(np.square(((y_pred - y_test) / y_test)), axis=0))
    
    return y_pred, y_test, rmspe

In [26]:
def objectiveXGB( trial: Trial, X, y, test):

    param = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 4000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 300),
        'gamma':trial.suggest_int('gamma', 1, 3),
        'learning_rate': 0.01,
        'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5, 1, 0.1),
        'nthread' : -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0] ),
        'random_state': 42
    }

    model = XGBRegressor(**param)
    xgb_model = model.fit(X, y, eval_metric = 'rmse')

    score = mean_squared_error(xgb_model.predict(X), y, squared=False)

    return score

def train_and_test_optuna(df):
  
    model = XGBRegressor()
    
    # 범주형 변수인 stock_id에 원핫 인코딩 적용
    dummies = pd.get_dummies(df['stock_id'])
    df = pd.concat([dummies, df], axis=1)
    
    # X, y 데이터로 분류
    X = data.drop(['stock_id', 'row_id', 'target', 'time_id'], axis = 1)
    y = data['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) # stratify: 비율 유지하면서 split
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/8, random_state=456)
    

    study = optuna.create_study(direction='minimize', sampler=TPESampler())

    study.optimize(lambda trial: objectiveXGB(trial, X_train, y_train, X_test), n_trials=50)
    print(f'Best trial : {study.best_trial.value}, \nparams {study.best_trial.params}')

    return study.best_trial.params


In [29]:
train_and_test_optuna(data)

[32m[I 2022-10-18 02:42:31,355][0m A new study created in memory with name: no-name-87844b84-8aae-45ea-96fa-7493a0c468ca[0m
[33m[W 2022-10-18 02:42:31,831][0m Trial 0 failed because of the following error: XGBoostError('[02:42:31] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/gbm/../common/common.h:239: XGBoost version not compiled with GPU support.\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x0000000176a847a8 dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000176b24da0 xgboost::gbm::GBTree::ConfigureUpdaters() + 436\n  [bt] (2) 3   libxgboost.dylib                    0x0000000176b2498c xgboost::gbm::GBTree::Configure(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std:

XGBoostError: [02:42:31] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/gbm/../common/common.h:239: XGBoost version not compiled with GPU support.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000176a847a8 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x0000000176b24da0 xgboost::gbm::GBTree::ConfigureUpdaters() + 436
  [bt] (2) 3   libxgboost.dylib                    0x0000000176b2498c xgboost::gbm::GBTree::Configure(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > > const&) + 964
  [bt] (3) 4   libxgboost.dylib                    0x0000000176b4087c xgboost::LearnerConfiguration::Configure() + 1016
  [bt] (4) 5   libxgboost.dylib                    0x0000000176b40b9c xgboost::LearnerImpl::UpdateOneIter(int, std::__1::shared_ptr<xgboost::DMatrix>) + 128
  [bt] (5) 6   libxgboost.dylib                    0x0000000176a88524 XGBoosterUpdateOneIter + 140
  [bt] (6) 7   libffi.8.dylib                      0x00000001051c404c ffi_call_SYSV + 76
  [bt] (7) 8   libffi.8.dylib                      0x00000001051c1790 ffi_call_int + 1256
  [bt] (8) 9   _ctypes.cpython-39-darwin.so        0x00000001051a416c _ctypes_callproc + 772

