In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import re
import glob
from tqdm import tqdm
from typing import Optional, List

In [11]:
import scipy as sc
from sklearn.model_selection import KFold
import warnings
# Silence all warnings for cleaner notebook output.
# NOTE(review): this blanket filter also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous in pandas versions that also define 'styler.render.max_columns',
# where it raises OptionError instead of setting the display width.
pd.set_option('display.max_columns', 300)

In [6]:
# Load the preprocessed training and test tables from Feather files.
# The paths are relative to the notebook's working directory.
train = pd.read_feather('preprocessing/vif_train_0')
test = pd.read_feather('preprocessing/X_test_0')

In [9]:
# Inspect the training columns: identifiers, features and the `target` label
train.columns

Index(['row_id', 'stock_id', 'time_id', 'total_volume_std',
       'trade_log_return_realized_volatility', 'trade_order_count_mean',
       'trade_pct_change_price_gmean', 'trade_seconds_in_bucket_count_unique',
       'trade_size_sum', 'target'],
      dtype='object')

In [10]:
# Inspect the test columns: same identifiers and features as train, without `target`
test.columns

Index(['row_id', 'stock_id', 'time_id', 'total_volume_std',
       'trade_log_return_realized_volatility', 'trade_order_count_mean',
       'trade_pct_change_price_gmean', 'trade_seconds_in_bucket_count_unique',
       'trade_size_sum'],
      dtype='object')

In [12]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Args:
        y_true: array-like of ground-truth values (ndarray, Series or list).
        y_pred: array-like of predictions with the same length as ``y_true``.

    Returns:
        The RMSPE as a float. Zeros in ``y_true`` produce inf/nan, and NaN
        values in either input propagate to the result.
    """
    # Compare element by element in positional order. Without this, two pandas
    # Series with different index labels would be aligned by label and the
    # result would silently become NaN.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Custom evaluation metric so LightGBM can report (and early stop on) RMSPE
def feval_rmspe(y_pred, lgb_train):
    """Evaluate RMSPE for the dataset LightGBM passes to the metric callback.

    Args:
        y_pred: predictions for the dataset being evaluated.
        lgb_train: dataset object exposing ``get_label()``; its labels are
            used as the ground truth.

    Returns:
        The tuple ``('RMSPE', score, False)``: metric name, metric value, and
        a flag that LightGBM reads as "higher is better" (False here, since a
        lower RMSPE is better).
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False



In [13]:
def train_and_evaluate(train: pd.DataFrame, test: pd.DataFrame, n_splits: int = 5) -> np.ndarray:
    """Train LightGBM with K-fold cross-validation and predict the test set.

    One model is trained per fold on the remaining folds, early-stopped on the
    held-out fold. Out-of-fold predictions are scored with RMSPE (printed),
    and the test predictions of all fold models are averaged.

    Args:
        train: frame with 'row_id', 'time_id', 'stock_id', 'target' and the
            feature columns.
        test: frame with 'row_id', 'time_id', 'stock_id' and the same feature
            columns. A 'target' column, if present, is ignored.
        n_splits: number of cross-validation folds (default 5).

    Returns:
        1-D array with one averaged prediction per row of ``test``.
    """
    # Hyperparameters (just basic)
    params = {
      'objective': 'rmse',
      'boosting_type': 'gbdt',
      'num_leaves': 100,
      'n_jobs': -1,
      'learning_rate': 0.1,
      'feature_fraction': 0.8,
      # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
      # takes effect when bagging_freq > 0. It is left as-is here so results
      # do not change; set 'bagging_freq' if row subsampling is intended.
      'bagging_fraction': 0.8,
      'verbose': -1
    }
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Drop a leftover 'target' column too (the submission cell adds one to
    # `test` in place), so re-running does not feed it to the model as a feature.
    x_test = test.drop(['row_id', 'time_id'], axis = 1).drop(columns = ['target'], errors = 'ignore')
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    x_test['stock_id'] = x_test['stock_id'].astype(int)

    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create test array to store predictions
    test_predictions = np.zeros(x_test.shape[0])
    # Create a KFold object
    kfold = KFold(n_splits = n_splits, random_state = 66, shuffle = True)
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights: weighting the squared
        # error by 1 / y^2 turns the 'rmse' objective into a squared
        # percentage error.
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        # Early stopping and periodic logging are passed as callbacks
        # (requires LightGBM >= 3.3); the `early_stopping_rounds` and
        # `verbose_eval` keyword arguments were removed from lgb.train in
        # LightGBM 4. Fresh callbacks are built for every fold because the
        # early-stopping callback keeps per-run state.
        fold_callbacks = [
            lgb.early_stopping(stopping_rounds = 50),
            lgb.log_evaluation(period = 50),
        ]
        model = lgb.train(params = params,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          num_boost_round = 10000,
                          feval = feval_rmspe,
                          callbacks = fold_callbacks)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)
        # Predict the test set; each fold contributes an equal share of the average
        test_predictions += model.predict(x_test) / n_splits

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return test_predictions


In [14]:
# Train and evaluate
test_predictions = train_and_evaluate(train, test)
# Save test predictions
# NOTE(review): this adds a 'target' column to `test` in place, so re-running
# this cell passes a `test` frame that already contains 'target'.
test['target'] = test_predictions
test[['row_id', 'target']].to_csv('submission.csv',index = False)

Training fold 1
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000543362	training's RMSPE: 0.251662	valid_1's rmse: 0.00056542	valid_1's RMSPE: 0.260904
[100]	training's rmse: 0.000532481	training's RMSPE: 0.246622	valid_1's rmse: 0.000563993	valid_1's RMSPE: 0.260246
[150]	training's rmse: 0.00052535	training's RMSPE: 0.243319	valid_1's rmse: 0.000564799	valid_1's RMSPE: 0.260618
Early stopping, best iteration is:
[109]	training's rmse: 0.000531049	training's RMSPE: 0.245959	valid_1's rmse: 0.000563877	valid_1's RMSPE: 0.260192
Training fold 2
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.000544025	training's RMSPE: 0.251648	valid_1's rmse: 0.000560814	valid_1's RMSPE: 0.260101
[100]	training's rmse: 0.000533016	training's RMSPE: 0.246556	valid_1's rmse: 0.000558826	valid_1's RMSPE: 0.25918
[150]	training's rmse: 0.000526244	training's RMSPE: 0.243423	valid_1's rmse: 0.000558849	valid_1's RMSPE: 0.25919
Early