In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from collections import OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('poster')

# `sklearn.cross_validation` was removed in scikit-learn 0.20; `cross_val_score`
# lives in `sklearn.model_selection`, which this notebook already relies on.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb
import lightgbm as lgb
import random

# Importing from my own modules
import sys
sys.path.append('../financial_forecasting/')
from utils import load_data, wMSE, train_and_test_models



# Load data

In [2]:
# Directory holding the preprocessed CSV splits (relative to this notebook).
DATA_DIR = '../data/preprocessed/'


def _read_single_column(filename):
    """Read ``DATA_DIR + filename`` and collapse it to a Series if it has one column.

    ``pd.read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and removed
    in 2.0; squeezing the parsed frame along the column axis is the supported
    equivalent and also works on older pandas versions.
    """
    return pd.read_csv(DATA_DIR + filename).squeeze('columns')


# Feature matrices for the three splits.
X_train = pd.read_csv(DATA_DIR + 'train.csv')
X_val = pd.read_csv(DATA_DIR + 'validation.csv')
X_test = pd.read_csv(DATA_DIR + 'test.csv')

# Per-sample weights used by the weighted error metric.
weights_train = _read_single_column('train_weights.csv')
weights_val = _read_single_column('validation_weights.csv')

# Regression targets.
y_train = _read_single_column('train_target.csv')
y_val = _read_single_column('validation_target.csv')

# Randomised search 

In [3]:
# Custom scoring function
def eval_error(y, y_pred, weights=None):
    """Score predictions with the project's ``wMSE`` helper.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted target values.
    weights : array-like, optional
        Per-sample weights handed to ``wMSE``. When omitted, every sample
        gets weight 1.

    Returns
    -------
    Whatever ``wMSE(preds=y_pred, y=y, weights=weights)`` returns.
    """
    if weights is None:
        # A scorer built with ``make_scorer`` calls ``score_func(y_true, y_pred)``
        # without extra arguments, so a required ``weights`` parameter raised
        # TypeError on every scoring call. Fall back to uniform weights.
        # NOTE(review): assumes wMSE accepts any array-like of weights; the
        # search then optimises the *unweighted* error -- confirm this is OK.
        weights = np.ones(len(y))
    return wMSE(preds=y_pred, y=y, weights=weights)

# Lower error is better, so the scorer is declared with greater_is_better=False.
my_new_score = make_scorer(eval_error, greater_is_better=False)

In [4]:
# Seed both generators: the hyper-parameter grid below is sampled with
# `np.random.rand`, which is not affected by Python's `random.seed`.
random.seed(42)
np.random.seed(42)

In [5]:
# Fifteen gamma candidates of the form 10**(-5*u) with u drawn from [0, 1),
# i.e. values spread over the interval (1e-5, 1].
gamma_candidates = np.power(10., -np.random.rand(15)*5)

# Search space handed to the randomised search.
cv_params = dict(
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    gamma=gamma_candidates,
)

In [7]:
# Base estimator for the search; it is built with a single boosting round.
# NOTE(review): n_estimators=1 looks like a smoke-test setting -- confirm.
model = lgb.LGBMRegressor(
    n_estimators=1,
)

In [8]:
# Randomised hyper-parameter search over `cv_params`, scored with the custom
# weighted-error scorer. `random_state` is fixed so that the sampled
# candidate(s) are the same on every Restart & Run All.
# NOTE(review): n_iter=1 with cv=2 evaluates a single candidate on two folds;
# presumably kept tiny because of the slow machine mentioned below -- confirm.
RCV = RandomizedSearchCV(
    model,
    cv_params,
    scoring=my_new_score,
    cv=2,
    n_jobs=2,
    n_iter=1,
    random_state=42,
)

In [None]:
# Run the search on the training split; `verbose=1` is supplied as an extra
# keyword argument to `fit`.
RCV.fit(
    X_train,
    y_train,
    verbose=1,
)

# Manual search

Because my machine is slow, I experiment with some hyperparameter tuning by hand, mostly for fun. I manage to improve the model.

In [38]:
# Columns fed to the models, grouped by the raw variable they derive from.
feats = [
    # identifiers and their mean encodings
    'Day', 'Market', 'Market_mean_encoded', 'Stock', 'Stock_mean_encoded',
    # x0 .. x2
    'x0', 'x0_log10', 'x0_log10_diff',
    'x1_log10', 'x1_log10_diff',
    'x2_log10', 'x2_log10_diff',
    # x3A .. x3E
    'x3A', 'x3A_log10', 'x3A_log10_diff',
    'x3B', 'x3B_binned', 'x3B_log10',
    'x3C', 'x3C_log10',
    'x3D', 'x3D_log10', 'x3D_log10_diff',
    'x3E', 'x3E_log10', 'x3E_log10_diff',
    # x4 .. x6
    'x4', 'x4_binned', 'x4_log10_diff',
    'x5', 'x5_binned', 'x5_log10', 'x5_log10_diff',
    'x6', 'x6_binned', 'x6_log10_diff',
]

In [40]:
# Use the full feature list for this experiment.
enabled_vars = feats

# Settings shared by all four candidates; each entry below adds the one
# regularisation-related argument that distinguishes it.
shared_xgb_kwargs = dict(n_estimators=700, n_jobs=-1, max_depth=5,
                         grow_policy='lossguide', tree_method='hist')
per_model_kwargs = [
    ('xgboost_reg1', dict(reg_lambda=10.0)),
    ('xgboost_reg2', dict(min_child_weight=3)),
    ('xgboost_reg3', dict(min_child_weight=6)),
    ('xgboost_reg4', dict(min_child_weight=10)),
]

models = OrderedDict(
    (label, xgb.sklearn.XGBRegressor(**dict(shared_xgb_kwargs, **extra)))
    for label, extra in per_model_kwargs
)

# Fit every candidate on the training split and evaluate on the validation
# split (the helper prints a train/test error per model, see output below).
train_features = X_train.loc[:, enabled_vars]
val_features = X_val.loc[:, enabled_vars]
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(
    models,
    train_features, y_train,
    val_features, y_val,
    weights_train, weights_val,
)

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, grow_policy='lossguide',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=700, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=10.0, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, tree_method='hist')
Train error: 8.314701647653596e-07 Test error: 9.790427422658719e-07 

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, grow_policy='lossguide',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=3, missing=None, n_estimators=700, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, tree_method='hist')
Train error: 8.020348082212422e-07 Tes

In [44]:
# Use the full feature list for this experiment.
enabled_vars = feats

# Same regularised configuration for every candidate; only the number of
# boosting rounds changes (xgboost_reg0 -> 800 ... xgboost_reg3 -> 500).
tree_counts = [800, 700, 600, 500]
models = OrderedDict(
    (
        'xgboost_reg%d' % position,
        xgb.sklearn.XGBRegressor(
            n_estimators=n_trees,
            n_jobs=-1,
            reg_lambda=10.0,
            max_depth=5,
            min_child_weight=10,
            grow_policy='lossguide',
            tree_method='hist',
        ),
    )
    for position, n_trees in enumerate(tree_counts)
)

# Fit every candidate on the training split and evaluate on the validation
# split (the helper prints a train/test error per model, see output below).
train_features = X_train.loc[:, enabled_vars]
val_features = X_val.loc[:, enabled_vars]
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(
    models,
    train_features, y_train,
    val_features, y_val,
    weights_train, weights_val,
)

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, grow_policy='lossguide',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=10, missing=None, n_estimators=800, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=10.0, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, tree_method='hist')
Train error: 8.235247638767495e-07 Test error: 9.80718041617954e-07 

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, grow_policy='lossguide',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=10, missing=None, n_estimators=700, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=10.0, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, tree_method='hist')
Train error: 8.235247638767495e-07