## Main code for Kaggle - Optiver Realized Volatility Prediction
@LaurentMombaerts 

In [1]:
%reset

## MACHINE TO SET UP

In [2]:
# ------------------------------------------------------------------
# Execution environment switch. The data-loading cell below handles
# two values: 'local' and 'kaggle'.
machine = "local"
# ------------------------------------------------------------------

**Lib Import / Data loading**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import glob
import time

# Parallel Computing
from joblib import Parallel, delayed

# ML
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupShuffleSplit

# Maths
from scipy.interpolate import interp1d
# from arch import arch_model

# Paths tricks
import os
from pathlib import Path

# Support code
from support_file import *
from information_measures import *

# Resolve the directory holding the competition files for the selected
# environment. `datapath` is what the feature-computation helpers receive.
if machine == 'local':
    datapath = os.path.join(str(Path.home()), 'ownCloud', 'Data', 'Kaggle', 'optiver-realized-volatility-prediction')
    input_dir = datapath
elif machine == 'kaggle':
    input_dir = '../input/optiver-realized-volatility-prediction'
    # On Kaggle `datapath` is set to the placeholder value 0.
    datapath = 0
else:
    # Fail early instead of leaving train/test undefined for later cells.
    raise ValueError(f"Unknown machine {machine!r}: expected 'local' or 'kaggle'")

# Load dataset
train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
all_stocks_ids = train['stock_id'].unique()
all_time_ids = train['time_id'].unique()

# Keep only the "<stock_id>-<time_id>" key and the target
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]

# Load test ids
test = pd.read_csv(os.path.join(input_dir, 'test.csv'))
all_stocks_ids_test = test['stock_id'].unique()
test = test.drop(['stock_id','time_id'],axis=1)
    

**Functions**

In [10]:
def trainModel_timeSplit(X,y,groups,model,splits):
    """Score `model` on repeated group-wise 80/20 shuffle splits.

    For each seed in ``range(splits)`` a single GroupShuffleSplit is drawn
    (train_size 0.80, random_state = seed). Both partitions are enriched with
    the per-stock / per-time statistics from `get_time_stock`, the identifier
    columns are dropped, the model is refit and the hold-out RMSPE recorded.

    Parameters
    ----------
    X : feature frame containing 'stock_id' and 'time_id' columns.
    y : targets, indexed with the integer arrays produced by the splitter.
    groups : group label per row, passed to GroupShuffleSplit.
    model : estimator exposing ``fit`` and ``predict``.
    splits : number of random splits to evaluate.

    Returns
    -------
    list with one RMSPE value per split.
    """
    scores = []

    for seed in range(splits):
        splitter = GroupShuffleSplit(n_splits=1, train_size=.80, random_state=seed)
        splitter.get_n_splits()

        for fit_idx, holdout_idx in splitter.split(X, y, groups):
            # Partition features and targets
            X_fit = X.iloc[fit_idx,:]
            X_holdout = X.iloc[holdout_idx,:]
            y_fit = y[fit_idx]
            y_holdout = y[holdout_idx]

            # Group statistics are computed separately inside each partition,
            # then the identifier columns are removed
            X_holdout = get_time_stock(X_holdout).drop(['time_id','stock_id'],axis=1)
            X_fit = get_time_stock(X_fit).drop(['time_id','stock_id'],axis=1)

            # Fit on the training part, predict the hold-out part
            model.fit(X_fit,y_fit)
            predictions = model.predict(X_holdout)

            scores.append(rmspe(y_holdout, predictions))

    return scores

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time aggregates of the realized-volatility columns.

    For each of 'stock_id' and 'time_id', the mean / std / max / min of the
    two realized-volatility columns are computed over the rows of `df` and
    left-merged back onto every row. New columns are named
    ``<vol_col>_<stat>_stock`` and ``<vol_col>_<stat>_time``.

    The input frame is not modified; a new frame is returned.
    """
    vol_cols = ['log_return1_realized_volatility', 'log_returnMidprice_realized_volatility']
    agg_funcs = ['mean', 'std', 'max', 'min']

    def _group_stats(key, tag):
        # Aggregate per `key`, flatten the two-level column index with '_',
        # then tag every column (the key becomes '<key>__<tag>').
        stats = df.groupby([key])[vol_cols].agg(agg_funcs).reset_index()
        stats.columns = ['_'.join(parts) for parts in stats.columns]
        return stats.add_suffix('_' + tag)

    stock_stats = _group_stats('stock_id', 'stock')
    time_stats = _group_stats('time_id', 'time')

    # Attach both sets of statistics, then discard the duplicated key columns
    enriched = df.merge(stock_stats, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    enriched = enriched.merge(time_stats, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])

    return enriched.drop(['stock_id__stock', 'time_id__time'], axis = 1)

def train_CatBoost_cv(df_features_train,targets,splits):
    """Cross-validate a default CatBoostRegressor with time_id-grouped splits.

    Parameters
    ----------
    df_features_train : feature frame with a 'row_id' column formatted as
        "<stock_id>-<time_id>".
    targets : target values aligned with the rows of `df_features_train`.
    splits : number of random grouped splits to evaluate.

    Returns
    -------
    list of RMSPE values, one per split (see `trainModel_timeSplit`).
    """
    model = CatBoostRegressor(verbose=0)

    # Data input / output definition
    X = df_features_train.fillna(0)
    y = targets

    # The time_id part of each row_id is the group label. Extracted in one
    # vectorized pass, so it does not rely on the index being 0..n-1.
    time_id_groups = df_features_train['row_id'].str.split('-').str[1].tolist()

    return trainModel_timeSplit(X,y,time_id_groups,model,splits)

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation metric reporting RMSPE.

    Parameters
    ----------
    y_pred : predictions for the dataset being evaluated.
    lgb_train : dataset object whose ``get_label()`` returns the true values.

    Returns
    -------
    tuple ``('RMSPE', value, False)``; the final ``False`` is the
    is-higher-better flag.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)

    return 'RMSPE', score, False

def train_lgbm_cv(df_features_train,targets,splits):
    """Cross-validate a LightGBM regressor with time_id-grouped shuffle splits.

    For each seed in ``range(splits)`` one 80/20 GroupShuffleSplit is drawn
    with 'time_id' as the group. Each partition is enriched with
    `get_time_stock` statistics, 'time_id' is dropped and 'stock_id' is kept
    as a categorical feature. Training uses early stopping (50 rounds) and
    reports the custom RMSPE metric.

    Parameters
    ----------
    df_features_train : feature frame containing 'stock_id' and 'time_id'.
    targets : pandas Series of targets, positionally aligned with the rows
        of `df_features_train`.
    splits : number of random grouped splits to evaluate.

    Returns
    -------
    list of validation RMSPE values, one per split.
    """
    # Data input / output definition
    X = df_features_train.fillna(0)
    y = targets
    # Group labels taken positionally, so a non-default index cannot
    # misalign them with the rows of X.
    time_id_groups = df_features_train['time_id'].tolist()

    # Hyperparameters (just basic)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1
    }

    rmspe_list = []

    for random_split in range(splits):
        gss = GroupShuffleSplit(n_splits=1, train_size=.80, random_state=random_split)

        for train_idx, val_idx in gss.split(X, y, time_id_groups):
            # CV definition: the splitter yields positions, so features and
            # targets are both selected positionally.
            x_train = X.iloc[train_idx,:].reset_index(drop=True)
            x_val = X.iloc[val_idx,:].reset_index(drop=True)
            y_train = y.iloc[train_idx].reset_index(drop=True)
            y_val = y.iloc[val_idx].reset_index(drop=True)

            # Add other stocks volatility at same time id and this stock overall volatility
            x_val = get_time_stock(x_val).drop(['time_id'],axis=1)
            x_val['stock_id'] = x_val['stock_id'].astype(int)
            x_train = get_time_stock(x_train).drop(['time_id'],axis=1)
            x_train['stock_id'] = x_train['stock_id'].astype(int)

            # Weighting squared error by 1/y^2 turns it into squared
            # percentage error, which matches the RMSPE metric.
            train_weights = 1 / np.square(y_train)
            val_weights = 1 / np.square(y_val)
            train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
            val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])

            # Model definition
            # NOTE(review): the `early_stopping_rounds` and `verbose_eval`
            # keyword arguments were removed from lgb.train in LightGBM 4.0;
            # newer versions need the callback equivalents.
            model = lgb.train(params = params,
                              train_set = train_dataset,
                              valid_sets = [train_dataset, val_dataset],
                              num_boost_round = 10000,
                              early_stopping_rounds = 50,
                              verbose_eval = 50,
                              feval = feval_rmspe)

            yhat = model.predict(x_val)

            # Estimate perf
            rmspe_list.append(rmspe(y_val, yhat))

    return rmspe_list

def train_lgbm(df_features_train,targets):
    """Fit a LightGBM regressor on the full training set (no validation).

    Parameters
    ----------
    df_features_train : feature frame containing a 'stock_id' column; NaNs
        are replaced by 0 on a copy, the caller's frame is left untouched.
    targets : target values aligned with the rows of `df_features_train`.

    Returns
    -------
    The trained booster returned by ``lgb.train`` (50 boosting rounds).
    """
    # Data input / output definition (fillna returns a new frame)
    X = df_features_train.fillna(0)
    X['stock_id'] = X['stock_id'].astype(int)
    y = targets

    # Hyperparameters (just basic)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 100,
        'n_jobs': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'verbose': -1
    }

    # Weighting squared error by 1/y^2 turns it into squared percentage error
    train_weights = 1 / np.square(y)
    train_dataset = lgb.Dataset(X, y, weight = train_weights, categorical_feature = ['stock_id'])

    # Model definition
    # NOTE(review): no valid_sets are passed, so `feval` / `verbose_eval`
    # presumably have nothing to evaluate here — confirm before relying on them.
    model = lgb.train(params = params,
                      train_set = train_dataset,
                      num_boost_round = 50,
                      verbose_eval = 50,
                      feval = feval_rmspe)

    return model

In [5]:
# Competition metric
def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true)^2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def _catboost_prediction(compute_features, machine, targets, all_stocks_ids, datapath):
    """Shared CatBoost strategy used by every feature-set variant.

    `compute_features` is called for the 'test' and then the 'train' dataset.
    A default CatBoostRegressor is fitted on the train features (without
    'row_id'). In local mode the model is evaluated on the training data
    itself and those predictions are returned with the *train* row ids; in
    any other mode the test set is predicted and returned with the test row ids.
    """
    df_features_encoded_test = compute_features(machine=machine, dataset='test', all_stocks_ids=all_stocks_ids, datapath=datapath)
    df_features_encoded_train = compute_features(machine=machine, dataset='train', all_stocks_ids=all_stocks_ids, datapath=datapath)

    # Training model
    X = df_features_encoded_train.drop(['row_id'],axis=1)
    y = targets
    model = CatBoostRegressor(verbose=0)
    model.fit(X,y)

    if machine == 'local':
        # In-sample predictions: they belong to the train rows, so they must
        # be paired with the train row ids (not the test ones).
        yhat = model.predict(X)
        print('New model catboost perf : ', rmspe(y, yhat))
        row_ids = df_features_encoded_train['row_id']
    else:
        # Predicting targets from test
        X_test = df_features_encoded_test.drop(['row_id'],axis=1)
        yhat = model.predict(X_test)
        row_ids = df_features_encoded_test['row_id']

    # Submission file
    yhat_pd = pd.DataFrame(yhat,columns=['target'])
    return pd.concat([row_ids,yhat_pd],axis=1)


def _garch_prediction(datapath):
    """Fit the per-stock GARCH predictor on the local book files and score it.

    NOTE: reads the notebook-level `train` frame (row_id / target) rather
    than a parameter, exactly as the original inline code did.
    """
    book_path_train = glob.glob(os.path.join(datapath,'book_train.parquet','*'))

    # fit garch and predict
    prediction = garch_volatility_per_stock(list_file=book_path_train, prediction_column_name='pred')

    # Merge and evaluate results (rows without a prediction are discarded)
    prediction = train.merge(prediction[['row_id','pred']], on = ['row_id'], how = 'left')
    prediction = prediction[prediction.pred.notnull()]

    # Estimate performances
    R2 = round(r2_score(y_true = prediction['target'], y_pred = prediction['pred']),3)
    RMSPE = round(rmspe(y_true = prediction['target'], y_pred = prediction['pred']),3)

    print('--')
    print(f'Performance of prediction: R2 score: {R2}, RMSPE: {RMSPE}')

    prediction = prediction.drop(columns=['target'])
    prediction = prediction.rename(columns={'pred': 'target'})

    return prediction


def _lgbm_2807_local(machine, targets, all_stocks_ids, datapath):
    """Local 'test_2807' run: compute features, cache them to CSV, run the CV.

    Returns the *train feature frame* (not a submission) for further use.
    """
    # Load data (the test features are computed for stock id 0 only)
    df_features_test = computeFeatures_2807(machine=machine, dataset='test', all_stocks_ids=[0], datapath=datapath).fillna(0)
    df_features_train = computeFeatures_2807(machine=machine, dataset='train', all_stocks_ids=all_stocks_ids, datapath=datapath).fillna(0)

    # saving features to the .csv file
    print('['+time.strftime('%X')+']', 'Saving features to .csv files ...') # print also time
    df_features_train.to_csv('df_features_train.csv')
    df_features_test.to_csv('df_features_test.csv')

    # Modelling
    start = time.time()
    print('['+time.strftime('%X')+']', 'Model Training on splits...')

    # train_CatBoost_cv(df_features_train=..., targets=..., splits=10) is the
    # alternative model for this step.
    list_rmspe = train_lgbm_cv(df_features_train=df_features_train,targets=targets,splits=10)

    print('['+time.strftime('%X')+']', 'Training on splits took ',  time.time() - start, 'seconds')

    # Print results
    print(list_rmspe)
    print('Mean of RMSPE : ', np.mean(np.array(list_rmspe)), ' +- ', np.std(np.array(list_rmspe)))

    return df_features_train # Returns the feature in local mode for further use


def _lgbm_2807_submission(machine, targets, all_stocks_ids, datapath, test, all_stocks_ids_test):
    """Non-local 'test_2807' run: train LightGBM on all data and predict the test set."""
    # Features computation
    df_features_test = computeFeatures_2807(machine=machine, dataset='test', all_stocks_ids=all_stocks_ids_test, datapath=datapath).fillna(0)
    df_features_test = test.merge(df_features_test, on = ['row_id'], how = 'left') # Should ensure order of predictions
    df_features_train = computeFeatures_2807(machine=machine, dataset='train', all_stocks_ids=all_stocks_ids, datapath=datapath).fillna(0)
    rows_id_to_merge = df_features_test['row_id'].copy()

    # Add other stocks volatility at same time id and this stock overall volatility
    df_features_test = get_time_stock(df_features_test).drop(['row_id','time_id'],axis=1)
    df_features_test['stock_id'] = df_features_test['stock_id'].astype(int)
    df_features_train = get_time_stock(df_features_train).drop(['row_id','time_id'],axis=1)

    # Optimized model
    model = train_lgbm(df_features_train=df_features_train, targets=targets)

    # Predicting targets from test
    yhat = model.predict(df_features_test)

    # Submission file
    yhat_pd = pd.DataFrame(yhat,columns=['target'])
    return pd.concat([rows_id_to_merge,yhat_pd],axis=1)


# Prediction function (chose here which prediction strategy to use)
def prediction_function(pred, machine, targets, all_stocks_ids, datapath, test, all_stocks_ids_test):
    """Run the prediction strategy selected by `pred`.

    Parameters
    ----------
    pred : strategy name; one of 'entropy', 'new_test_laurent',
        'new_test_laurent_withoutEncoding',
        'new_test_laurent_withoutEncoding_wTrades', 'garch', 'test_2807'.
    machine : 'local' switches every strategy to its evaluation mode; any
        other value produces test-set predictions.
    targets : training targets.
    all_stocks_ids : stock ids passed to the feature computation.
    datapath : data location passed to the feature computation.
    test : frame holding the test 'row_id's (used by 'test_2807').
    all_stocks_ids_test : test stock ids (used by 'test_2807').

    Returns
    -------
    DataFrame with 'row_id' and 'target' columns, except for local
    'test_2807', which returns the computed train feature frame.

    Raises
    ------
    NotImplementedError : 'garch' requested on a non-local machine.
    ValueError : `pred` is not a known strategy.
    """
    if pred == 'entropy':
        return _catboost_prediction(computeFeatures_wEntropy, machine, targets, all_stocks_ids, datapath)

    if pred == 'new_test_laurent':
        return _catboost_prediction(computeFeatures_newTest_Laurent, machine, targets, all_stocks_ids, datapath)

    if pred == 'new_test_laurent_withoutEncoding':
        return _catboost_prediction(computeFeatures_newTest_Laurent_noCode, machine, targets, all_stocks_ids, datapath)

    if pred == 'new_test_laurent_withoutEncoding_wTrades':
        return _catboost_prediction(computeFeatures_newTest_Laurent_wTrades, machine, targets, all_stocks_ids, datapath)

    if pred == 'garch':
        if machine != 'local':
            # Previously this fell through to an undefined `submission_file`.
            raise NotImplementedError("The 'garch' strategy is only implemented for machine == 'local'")
        return _garch_prediction(datapath)

    if pred == 'test_2807':
        if machine == 'local':
            return _lgbm_2807_local(machine, targets, all_stocks_ids, datapath)
        return _lgbm_2807_submission(machine, targets, all_stocks_ids, datapath, test, all_stocks_ids_test)

    raise ValueError(f"Unknown prediction strategy: {pred!r}")

**Submission**

In [None]:
# New sub
import warnings

# Silence all library warnings for the (long) feature / training run
warnings.filterwarnings('ignore')

# With machine == 'local' and pred == 'test_2807' the returned frame is the
# train feature table, not a submission.
df_submission = prediction_function(
    pred='test_2807',
    machine=machine,
    targets=train['target'],
    all_stocks_ids=all_stocks_ids,
    datapath=datapath,
    test=test,
    all_stocks_ids_test=all_stocks_ids_test,
)
# if machine == 'kaggle':
#     df_submission.to_csv('submission.csv',index=False)
# else:
#     df_submission.iloc[0:10,:].to_csv('features_train_head.csv')

In [6]:
# Reload the cached train features and drop the index column saved by to_csv
features_temp = pd.read_csv('df_features_train.csv').drop(columns=['Unnamed: 0'])

# row_id is "<stock_id>-<time_id>": split it once and derive both integer ids
row_id_parts = features_temp['row_id'].str.split('-')
features_temp['stock_id'] = row_id_parts.str[0].astype('int64')
features_temp['time_id'] = row_id_parts.str[1].astype('int64')

features_temp = features_temp.drop(columns=['row_id'])
features_temp

Unnamed: 0,wap_sum,wap_mean,wap_std,mid_price_sum,mid_price_mean,mid_price_std,log_return1_sum,log_return1_realized_volatility,log_return1_mean,log_return1_std,...,trade_size_sum,trade_order_count_sum,trade_order_count_mean,trade_roll_measure,trade_roll_impact,trade_amihud,trade_traded_volume,trade_avg_trade_size,stock_id,time_id
0,303.125061,1.003725,0.000693,303.129970,1.003742,0.000589,0.002292,0.004499,7.613599e-06,0.000260,...,3179.0,110.0,2.750000,0.000204,6.382619e-08,4.216972e-07,3190.139181,28.900000,0,5
1,200.047768,1.000239,0.000262,200.041690,1.000209,0.000241,0.000360,0.001204,1.810239e-06,0.000086,...,1289.0,57.0,1.900000,0.000463,3.589880e-07,6.228642e-07,1289.353432,22.614035,0,11
2,187.913849,0.999542,0.000864,187.918460,0.999566,0.000746,-0.002074,0.002369,-1.109201e-05,0.000173,...,2161.0,68.0,2.720000,0.000407,1.886683e-07,1.175238e-06,2158.608928,31.779412,0,16
3,119.859781,0.998832,0.000757,119.864975,0.998875,0.000616,-0.002828,0.002574,-2.376661e-05,0.000236,...,1962.0,59.0,3.933333,0.000738,3.764258e-07,1.152450e-06,1959.605547,33.254237,0,31
4,175.932865,0.999619,0.000258,175.930570,0.999606,0.000194,-0.000002,0.001894,-1.057099e-08,0.000144,...,1791.0,89.0,4.045455,0.000362,2.023833e-07,1.531390e-07,1790.254496,20.123596,0,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,309.870466,0.999582,0.000486,309.824550,0.999434,0.000465,-0.000527,0.003691,-1.706835e-06,0.000210,...,2570.0,103.0,2.783784,0.000244,9.479032e-08,4.704802e-07,2568.838117,24.951456,126,32751
428928,223.552143,1.002476,0.001264,223.528810,1.002371,0.001278,0.004436,0.004104,1.998029e-05,0.000275,...,2323.0,147.0,3.418605,0.000497,2.134959e-07,2.091255e-06,2327.828627,15.802721,126,32753
428929,256.277050,1.001082,0.000466,256.211550,1.000826,0.000455,0.001525,0.003118,5.979199e-06,0.000196,...,3740.0,98.0,2.800000,0.000776,2.072699e-07,3.455508e-07,3742.254714,38.163265,126,32758
428930,399.721736,1.001809,0.000456,399.748570,1.001876,0.000410,0.000256,0.003661,6.429922e-07,0.000184,...,9389.0,234.0,2.925000,0.000184,1.950722e-08,2.399810e-08,9406.795437,40.123932,126,32763


## Add clusters to `features_temp`

In [7]:
# Cluster assignments, keyed by stock_id and by time_id respectively
stock_clusters = pd.read_csv('stock_clusters.csv')
time_clusters = pd.read_csv('time_clusters.csv')

# Left-join both tables onto the features so that every feature row is kept
features_temp = (
    features_temp
    .merge(stock_clusters, on='stock_id', how='left')
    .merge(time_clusters, on='time_id', how='left')
)

In [8]:
# Standardize the numeric features; the trailing id / cluster columns are
# passed through unchanged.
from sklearn import preprocessing

# The last 8 columns are stock_id, time_id and the six cluster labels
n_passthrough = 8
scaled_columns = features_temp.columns[:-n_passthrough]
passthrough_columns = features_temp.columns[-n_passthrough:]

data = features_temp.loc[:, scaled_columns]
scaler = preprocessing.StandardScaler()
data_scaled = scaler.fit_transform(data)

scaled_frame = pd.DataFrame(data_scaled, columns=data.columns)
features_scaled = pd.concat([scaled_frame, features_temp.loc[:, passthrough_columns]], axis=1)
features_scaled

Unnamed: 0,wap_sum,wap_mean,wap_std,mid_price_sum,mid_price_mean,mid_price_std,log_return1_sum,log_return1_realized_volatility,log_return1_mean,log_return1_std,...,trade_traded_volume,trade_avg_trade_size,stock_id,time_id,km_x,hc_x,ha_x,km_y,hc_y,ha_y
0,-0.639031,1.104316,-0.396660,-0.639001,1.111899,-0.469451,0.619973,0.074238,0.705678,0.150592,...,-0.408029,-0.469105,0,5,2,1,1,3,1,1
1,-1.397873,0.069885,-0.806592,-1.397924,0.060385,-0.803573,0.100188,-0.844596,0.169450,-0.693876,...,-0.435082,-0.542342,0,11,2,1,1,1,1,1
2,-1.487201,-0.136938,-0.233761,-1.487174,-0.130757,-0.319051,-0.554959,-0.519973,-1.022713,-0.268272,...,-0.422710,-0.435558,0,16,2,1,1,1,1,1
3,-1.988207,-0.347661,-0.336160,-1.988175,-0.336548,-0.444075,-0.757878,-0.462721,-2.193842,0.034539,...,-0.425542,-0.418375,0,31,2,1,1,1,1,1
4,-1.575404,-0.114151,-0.810551,-1.575427,-0.119067,-0.848177,0.002744,-0.652162,0.001207,-0.412300,...,-0.427953,-0.571358,0,62,2,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,-0.589373,-0.124951,-0.593530,-0.589716,-0.170105,-0.588372,-0.138693,-0.151275,-0.155527,-0.088900,...,-0.416872,-0.515109,126,32751,2,1,1,1,1,1
428928,-1.224837,0.733630,0.146578,-1.225015,0.704073,0.191491,1.196938,-0.036057,1.848355,0.226598,...,-0.420302,-0.621700,126,32753,2,1,1,1,1,1
428929,-0.983920,0.320113,-0.613047,-0.984409,0.244283,-0.598003,0.413562,-0.311090,0.554660,-0.160531,...,-0.400171,-0.361180,126,32758,2,1,1,1,1,1
428930,0.072101,0.535701,-0.621808,0.072293,0.556692,-0.640814,0.072112,-0.159511,0.061596,-0.217657,...,-0.319551,-0.338337,126,32763,2,1,1,1,1,1


## Train the model with these features

In [22]:
# Same grouped CV on three feature sets:
#   my_list_rmspe  - standardized features + cluster labels
#   my_list_rmspe1 - raw features + cluster labels
#   my_list_rmspe2 - raw features with the last six (cluster) columns removed
cv_targets = train['target']
features_without_clusters = features_temp.loc[:, features_temp.columns[:-6]]

my_list_rmspe = train_lgbm_cv(
    df_features_train=features_scaled,
    targets=cv_targets,
    splits=10,
)
my_list_rmspe1 = train_lgbm_cv(
    df_features_train=features_temp,
    targets=cv_targets,
    splits=10,
)
my_list_rmspe2 = train_lgbm_cv(
    df_features_train=features_without_clusters,
    targets=cv_targets,
    splits=10,
)



Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000454354	training's RMSPE: 0.210554	valid_1's rmse: 0.000506848	valid_1's RMSPE: 0.233354
[100]	training's rmse: 0.000421573	training's RMSPE: 0.195363	valid_1's rmse: 0.0005091	valid_1's RMSPE: 0.234391
Early stopping, best iteration is:
[67]	training's rmse: 0.000440445	training's RMSPE: 0.204108	valid_1's rmse: 0.000505266	valid_1's RMSPE: 0.232626




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000452985	training's RMSPE: 0.209564	valid_1's rmse: 0.000493733	valid_1's RMSPE: 0.228868
[100]	training's rmse: 0.000418779	training's RMSPE: 0.193739	valid_1's rmse: 0.000497153	valid_1's RMSPE: 0.230453
Early stopping, best iteration is:
[51]	training's rmse: 0.000452363	training's RMSPE: 0.209276	valid_1's rmse: 0.000493557	valid_1's RMSPE: 0.228786




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000454283	training's RMSPE: 0.209954	valid_1's rmse: 0.000497688	valid_1's RMSPE: 0.231618
[100]	training's rmse: 0.000419085	training's RMSPE: 0.193687	valid_1's rmse: 0.000503294	valid_1's RMSPE: 0.234228
Early stopping, best iteration is:
[55]	training's rmse: 0.000450512	training's RMSPE: 0.208212	valid_1's rmse: 0.000497301	valid_1's RMSPE: 0.231439




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000455758	training's RMSPE: 0.209408	valid_1's rmse: 0.000484983	valid_1's RMSPE: 0.230822
[100]	training's rmse: 0.000423108	training's RMSPE: 0.194406	valid_1's rmse: 0.000487984	valid_1's RMSPE: 0.23225
Early stopping, best iteration is:
[56]	training's rmse: 0.000451184	training's RMSPE: 0.207307	valid_1's rmse: 0.000484151	valid_1's RMSPE: 0.230426




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000453491	training's RMSPE: 0.210648	valid_1's rmse: 0.000504916	valid_1's RMSPE: 0.230237
[100]	training's rmse: 0.000418886	training's RMSPE: 0.194574	valid_1's rmse: 0.000507258	valid_1's RMSPE: 0.231305
Early stopping, best iteration is:
[58]	training's rmse: 0.000445904	training's RMSPE: 0.207124	valid_1's rmse: 0.000503751	valid_1's RMSPE: 0.229706




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000447329	training's RMSPE: 0.208822	valid_1's rmse: 0.0005252	valid_1's RMSPE: 0.234462
[100]	training's rmse: 0.000416148	training's RMSPE: 0.194266	valid_1's rmse: 0.000527637	valid_1's RMSPE: 0.23555
Early stopping, best iteration is:
[52]	training's rmse: 0.00044605	training's RMSPE: 0.208225	valid_1's rmse: 0.000524718	valid_1's RMSPE: 0.234247




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000454698	training's RMSPE: 0.210448	valid_1's rmse: 0.000494016	valid_1's RMSPE: 0.2286
[100]	training's rmse: 0.00042102	training's RMSPE: 0.194862	valid_1's rmse: 0.000497386	valid_1's RMSPE: 0.230159
Early stopping, best iteration is:
[51]	training's rmse: 0.000453642	training's RMSPE: 0.20996	valid_1's rmse: 0.000493905	valid_1's RMSPE: 0.228549




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000453184	training's RMSPE: 0.209978	valid_1's rmse: 0.000501883	valid_1's RMSPE: 0.231216
Early stopping, best iteration is:
[48]	training's rmse: 0.000454632	training's RMSPE: 0.210649	valid_1's rmse: 0.000501681	valid_1's RMSPE: 0.231123




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000452747	training's RMSPE: 0.210214	valid_1's rmse: 0.000502224	valid_1's RMSPE: 0.229408
Early stopping, best iteration is:
[41]	training's rmse: 0.000462257	training's RMSPE: 0.214629	valid_1's rmse: 0.000500963	valid_1's RMSPE: 0.228832




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000455611	training's RMSPE: 0.210137	valid_1's rmse: 0.000500218	valid_1's RMSPE: 0.234666
[100]	training's rmse: 0.00042116	training's RMSPE: 0.194248	valid_1's rmse: 0.000505339	valid_1's RMSPE: 0.237068
Early stopping, best iteration is:
[54]	training's rmse: 0.00045161	training's RMSPE: 0.208292	valid_1's rmse: 0.000499978	valid_1's RMSPE: 0.234553




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000454414	training's RMSPE: 0.210582	valid_1's rmse: 0.000508292	valid_1's RMSPE: 0.234019
[100]	training's rmse: 0.000421496	training's RMSPE: 0.195327	valid_1's rmse: 0.000511172	valid_1's RMSPE: 0.235345
Early stopping, best iteration is:
[59]	training's rmse: 0.000446698	training's RMSPE: 0.207006	valid_1's rmse: 0.000507589	valid_1's RMSPE: 0.233695




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000453033	training's RMSPE: 0.209586	valid_1's rmse: 0.000493497	valid_1's RMSPE: 0.228759
Early stopping, best iteration is:
[46]	training's rmse: 0.000457022	training's RMSPE: 0.211432	valid_1's rmse: 0.000492904	valid_1's RMSPE: 0.228484




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000455511	training's RMSPE: 0.210522	valid_1's rmse: 0.000496922	valid_1's RMSPE: 0.231262
Early stopping, best iteration is:
[49]	training's rmse: 0.000457268	training's RMSPE: 0.211334	valid_1's rmse: 0.000495438	valid_1's RMSPE: 0.230572




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.00045704	training's RMSPE: 0.209997	valid_1's rmse: 0.000485469	valid_1's RMSPE: 0.231053
[100]	training's rmse: 0.000422327	training's RMSPE: 0.194047	valid_1's rmse: 0.000489111	valid_1's RMSPE: 0.232787
Early stopping, best iteration is:
[58]	training's rmse: 0.000450252	training's RMSPE: 0.206878	valid_1's rmse: 0.000485095	valid_1's RMSPE: 0.230875




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000453857	training's RMSPE: 0.210818	valid_1's rmse: 0.000507178	valid_1's RMSPE: 0.231268
[100]	training's rmse: 0.00042089	training's RMSPE: 0.195505	valid_1's rmse: 0.000510569	valid_1's RMSPE: 0.232814
Early stopping, best iteration is:
[59]	training's rmse: 0.000445735	training's RMSPE: 0.207045	valid_1's rmse: 0.000506873	valid_1's RMSPE: 0.231129




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000447752	training's RMSPE: 0.20902	valid_1's rmse: 0.000527873	valid_1's RMSPE: 0.235656
Early stopping, best iteration is:
[45]	training's rmse: 0.000453131	training's RMSPE: 0.21153	valid_1's rmse: 0.000526619	valid_1's RMSPE: 0.235096




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000454193	training's RMSPE: 0.210215	valid_1's rmse: 0.000495845	valid_1's RMSPE: 0.229446
[100]	training's rmse: 0.000423163	training's RMSPE: 0.195853	valid_1's rmse: 0.000498	valid_1's RMSPE: 0.230443
Early stopping, best iteration is:
[50]	training's rmse: 0.000454193	training's RMSPE: 0.210215	valid_1's rmse: 0.000495845	valid_1's RMSPE: 0.229446




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000452004	training's RMSPE: 0.209432	valid_1's rmse: 0.000502458	valid_1's RMSPE: 0.231481
[100]	training's rmse: 0.000418417	training's RMSPE: 0.193869	valid_1's rmse: 0.000505932	valid_1's RMSPE: 0.233082
Early stopping, best iteration is:
[56]	training's rmse: 0.000447419	training's RMSPE: 0.207307	valid_1's rmse: 0.000502113	valid_1's RMSPE: 0.231322




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.00045203	training's RMSPE: 0.209881	valid_1's rmse: 0.000502412	valid_1's RMSPE: 0.229494
[100]	training's rmse: 0.000417347	training's RMSPE: 0.193778	valid_1's rmse: 0.000507183	valid_1's RMSPE: 0.231673
Early stopping, best iteration is:
[50]	training's rmse: 0.00045203	training's RMSPE: 0.209881	valid_1's rmse: 0.000502412	valid_1's RMSPE: 0.229494




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000456042	training's RMSPE: 0.210336	valid_1's rmse: 0.000498329	valid_1's RMSPE: 0.233779
[100]	training's rmse: 0.000420195	training's RMSPE: 0.193803	valid_1's rmse: 0.000501922	valid_1's RMSPE: 0.235465
Early stopping, best iteration is:
[52]	training's rmse: 0.000454554	training's RMSPE: 0.20965	valid_1's rmse: 0.000498027	valid_1's RMSPE: 0.233638




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000455949	training's RMSPE: 0.211293	valid_1's rmse: 0.000513567	valid_1's RMSPE: 0.236448
[100]	training's rmse: 0.000421443	training's RMSPE: 0.195302	valid_1's rmse: 0.000513366	valid_1's RMSPE: 0.236355
Early stopping, best iteration is:
[75]	training's rmse: 0.00043607	training's RMSPE: 0.20208	valid_1's rmse: 0.000510599	valid_1's RMSPE: 0.235081




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000455307	training's RMSPE: 0.210638	valid_1's rmse: 0.000492729	valid_1's RMSPE: 0.228402
[100]	training's rmse: 0.000420848	training's RMSPE: 0.194696	valid_1's rmse: 0.000498428	valid_1's RMSPE: 0.231044
Early stopping, best iteration is:
[58]	training's rmse: 0.000448081	training's RMSPE: 0.207296	valid_1's rmse: 0.000492358	valid_1's RMSPE: 0.22823




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000456235	training's RMSPE: 0.210857	valid_1's rmse: 0.000498049	valid_1's RMSPE: 0.231787
Early stopping, best iteration is:
[47]	training's rmse: 0.000459741	training's RMSPE: 0.212477	valid_1's rmse: 0.000497845	valid_1's RMSPE: 0.231692




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000458152	training's RMSPE: 0.210508	valid_1's rmse: 0.000488198	valid_1's RMSPE: 0.232352
[100]	training's rmse: 0.000422583	training's RMSPE: 0.194165	valid_1's rmse: 0.000490523	valid_1's RMSPE: 0.233459
Early stopping, best iteration is:
[58]	training's rmse: 0.00045162	training's RMSPE: 0.207507	valid_1's rmse: 0.00048733	valid_1's RMSPE: 0.231939




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000453385	training's RMSPE: 0.210599	valid_1's rmse: 0.000513246	valid_1's RMSPE: 0.234035
Early stopping, best iteration is:
[46]	training's rmse: 0.000458723	training's RMSPE: 0.213078	valid_1's rmse: 0.000512256	valid_1's RMSPE: 0.233584




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000449014	training's RMSPE: 0.209609	valid_1's rmse: 0.000524824	valid_1's RMSPE: 0.234295
[100]	training's rmse: 0.000416931	training's RMSPE: 0.194632	valid_1's rmse: 0.000531939	valid_1's RMSPE: 0.237471
Early stopping, best iteration is:
[50]	training's rmse: 0.000449014	training's RMSPE: 0.209609	valid_1's rmse: 0.000524824	valid_1's RMSPE: 0.234295




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000453873	training's RMSPE: 0.210067	valid_1's rmse: 0.000495339	valid_1's RMSPE: 0.229212
Early stopping, best iteration is:
[48]	training's rmse: 0.000456097	training's RMSPE: 0.211096	valid_1's rmse: 0.000495244	valid_1's RMSPE: 0.229168




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.00045198	training's RMSPE: 0.209421	valid_1's rmse: 0.000505214	valid_1's RMSPE: 0.232751
Early stopping, best iteration is:
[44]	training's rmse: 0.00045951	training's RMSPE: 0.212909	valid_1's rmse: 0.000503609	valid_1's RMSPE: 0.232011




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000451597	training's RMSPE: 0.20968	valid_1's rmse: 0.000505072	valid_1's RMSPE: 0.230708
Early stopping, best iteration is:
[42]	training's rmse: 0.000460868	training's RMSPE: 0.213985	valid_1's rmse: 0.000503648	valid_1's RMSPE: 0.230058




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000455734	training's RMSPE: 0.210194	valid_1's rmse: 0.000503493	valid_1's RMSPE: 0.236202
[100]	training's rmse: 0.00042066	training's RMSPE: 0.194017	valid_1's rmse: 0.00050535	valid_1's RMSPE: 0.237073
Early stopping, best iteration is:
[54]	training's rmse: 0.00045155	training's RMSPE: 0.208264	valid_1's rmse: 0.00050315	valid_1's RMSPE: 0.236041


In [32]:
# Report mean ± standard deviation of the RMSPE values stored in each result
# list, one line per variant, in the order: initial, initial + clusters,
# normalized + clusters. The lists themselves are filled by earlier cells.
rmspe_report_rows = (
    ('Mean of RMSPE (initial) : ', my_list_rmspe2),
    ('Mean of RMSPE (initial + clusters) : ', my_list_rmspe1),
    ('Mean of RMSPE (normalized + clusters) : ', my_list_rmspe),
)
for report_label, rmspe_values in rmspe_report_rows:
    rmspe_array = np.array(rmspe_values)
    # print() joins its arguments with single spaces, so the label's trailing
    # space produces the double space seen before the mean in the output.
    print(report_label, np.mean(rmspe_array), u'\u00B1', np.std(rmspe_array))

Mean of RMSPE (initial) :  0.23220997646978137 ± 0.002434308917999223
Mean of RMSPE (initial + clusters) :  0.23137505065271177 ± 0.0020230195312899543
Mean of RMSPE (normalized + clusters) :  0.23102856725662052 ± 0.0020886798188772646
