In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
sns.set_context('poster')

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
# `sklearn.cross_validation` was removed in scikit-learn 0.20; `cross_val_score`
# lives in `sklearn.model_selection`, which this notebook already relies on.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer
# `sklearn.externals.joblib` was removed in scikit-learn 0.23. Prefer the vendored
# copy where it still exists (so previously written files load the same way) and
# fall back to the standalone `joblib` package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
import xgboost as xgb
import lightgbm as lgb

# Importing from my own modules
import sys
sys.path.append('../financial_forecasting/')
from utils import load_data, wMSE, train_and_test_models
from preprocessing import Imputer, LogTransformer, MeanEncoder, TreeBinner, transform_to_embedding_vec



# Load data

In [2]:
# Preprocessed feature tables for the train / validation / test splits.
X_train = pd.read_csv('../data/preprocessed/train.csv')
X_val = pd.read_csv('../data/preprocessed/validation.csv')
X_test = pd.read_csv('../data/preprocessed/test.csv')

# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the parsed frame along the column axis gives the same result:
# a Series when the file holds a single column, the DataFrame otherwise.
weights_train = pd.read_csv('../data/preprocessed/train_weights.csv').squeeze('columns')
weights_val = pd.read_csv('../data/preprocessed/validation_weights.csv').squeeze('columns')

y_train = pd.read_csv('../data/preprocessed/train_target.csv').squeeze('columns')
y_val = pd.read_csv('../data/preprocessed/validation_target.csv').squeeze('columns')

# Load embeddings

In [3]:
# Path of the serialized embeddings, relative to the notebook directory.
emb_file = '../data/embeddings.pkl'
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load embedding files that come from a trusted source.
embedding_matrix = joblib.load(emb_file)

In [4]:
# Map the `Stock` column of every split through the embedding matrix.
# The three calls are independent, so they are issued in one pass, in the
# same train -> validation -> test order as before.
df_embeddings_train, df_embeddings_val, df_embeddings_test = (
    transform_to_embedding_vec(split['Stock'], embedding_matrix)
    for split in (X_train, X_val, X_test)
)

In [5]:
# Append the embedding columns to each split (column-wise concat aligns on index).
# NOTE: each name is rebound to its own widened frame, so re-running this cell
# appends the embedding columns a second time; reload the data first.
X_train = pd.concat([X_train, df_embeddings_train], axis='columns')
X_val = pd.concat([X_val, df_embeddings_val], axis='columns')
X_test = pd.concat([X_test, df_embeddings_test], axis='columns')

# Train and evaluate models using embeddings

In [9]:
# Names of the embedding columns, in frame order.
embedding_feats = [column for column in df_embeddings_train.columns]

In [10]:
# Baseline feature set (no embedding columns). Same names and same order as
# before; one line per base column to make additions/removals easy to diff.
feats_no_embeddings = [
    'Day',
    'Market', 'Market_mean_encoded',
    'Stock', 'Stock_mean_encoded',
    'x0', 'x0_log10', 'x0_log10_diff',
    'x1_log10', 'x1_log10_diff',
    'x2_log10', 'x2_log10_diff',
    'x3A', 'x3A_log10', 'x3A_log10_diff',
    'x3B', 'x3B_binned', 'x3B_log10',
    'x3C', 'x3C_log10',
    'x3D', 'x3D_log10', 'x3D_log10_diff',
    'x3E', 'x3E_log10', 'x3E_log10_diff',
    'x4', 'x4_binned', 'x4_log10_diff',
    'x5', 'x5_binned', 'x5_log10', 'x5_log10_diff',
    'x6', 'x6_binned', 'x6_log10_diff',
]

# Baseline features followed by the embedding columns.
feats_with_embeddings = list(feats_no_embeddings)
feats_with_embeddings.extend(embedding_feats)

In [8]:
# Baseline run: both LightGBM configurations on the non-embedding features only.
enabled_vars = feats_no_embeddings

models = OrderedDict()
models['lgboost_reg'] = lgb.sklearn.LGBMRegressor(
    n_estimators=350, max_depth=5, reg_lambda=10.0, n_jobs=-1)
models['lgboost'] = lgb.sklearn.LGBMRegressor(n_estimators=700)

# The "Test error" printed by the helper is measured on the validation split
# passed in here.
(df_preds_train, df_preds_test,
 train_error, test_error) = train_and_test_models(
    models,
    X_train.loc[:, enabled_vars], y_train,
    X_val.loc[:, enabled_vars], y_val,
    weights_train, weights_val,
)

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=5, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=350,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=10.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 8.207052447849935e-07 Test error: 9.681071629000013e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=700,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 6.37821141588693e-07 Test error: 9.733154677963492e-07 



In [9]:
# Same two LightGBM configurations as the baseline run, now with the embedding
# columns added to the feature set.
enabled_vars = feats_with_embeddings

lgb_regularised = lgb.sklearn.LGBMRegressor(
    n_estimators=350, n_jobs=-1, max_depth=5, reg_lambda=10.0)
lgb_default = lgb.sklearn.LGBMRegressor(n_estimators=700)
models = OrderedDict([
    ('lgboost_reg', lgb_regularised),
    ('lgboost', lgb_default),
])

train_features = X_train.loc[:, enabled_vars]
val_features = X_val.loc[:, enabled_vars]
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(
    models, train_features, y_train, val_features, y_val, weights_train, weights_val)

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=5, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=350,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=10.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 7.996524308749735e-07 Test error: 9.608319998945923e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=700,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 6.16838249670288e-07 Test error: 9.630233338686736e-07 



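Embeddings helped slightly compared to using no embeddings: the validation error dropped from 9.68e-07 to 9.61e-07 for `lgboost_reg` and from 9.73e-07 to 9.63e-07 for `lgboost` (roughly 1% in both cases).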

# Train final model

In [10]:
# Fit the four models that make up the final blend (two XGBoost, two LightGBM)
# on the training split, using the features plus embeddings.
enabled_vars = feats_with_embeddings

# Insertion order is preserved: xgboost, lgboost_reg, lgboost, xgboost_reg.
models = OrderedDict()
models['xgboost'] = xgb.sklearn.XGBRegressor(n_estimators=700, n_jobs=-1)
models['lgboost_reg'] = lgb.sklearn.LGBMRegressor(
    n_estimators=350, n_jobs=-1, max_depth=5, reg_lambda=10.0)
models['lgboost'] = lgb.sklearn.LGBMRegressor(n_estimators=700)
models['xgboost_reg'] = xgb.sklearn.XGBRegressor(
    n_estimators=700, n_jobs=-1, reg_lambda=10.0,
    grow_policy='lossguide', tree_method='hist', max_depth=5)

(df_preds_train, df_preds_test,
 train_error, test_error) = train_and_test_models(
    models,
    X_train.loc[:, enabled_vars], y_train,
    X_val.loc[:, enabled_vars], y_val,
    weights_train, weights_val,
)

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=700,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Train error: 8.743937206761713e-07 Test error: 9.807254616703507e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=5, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=350,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=10.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 7.996524308749755e-07 Test error: 9.608319728597496e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, 

In [11]:
# Model blending on the validation set: an equal-weight average of the four
# models' predictions (no meta-learner is fitted, so this is not true stacking).
blend_cols = ['xgboost_preds_test', 'lgboost_preds_test',
              'xgboost_reg_preds_test', 'lgboost_reg_preds_test']
preds = df_preds_test[blend_cols[0]]
for col_name in blend_cols[1:]:
    # Same left-to-right summation order as a single chained `+` expression.
    preds = preds + df_preds_test[col_name]
preds = preds / 4
wMSE(preds=preds, y=y_val, weights=weights_val)

9.465412986335482e-07

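The score on the validation set with entity embeddings is better: the averaged ensemble reaches 9.47e-07, below every individual validation error printed above.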

In [12]:
# Model blending on the test set: predict with each fitted model, then average.
X_test_enabled = X_test.loc[:, enabled_vars]
p1 = models['xgboost'].predict(X_test_enabled)
p2 = models['lgboost'].predict(X_test_enabled)
p3 = models['lgboost_reg'].predict(X_test_enabled)
p4 = models['xgboost_reg'].predict(X_test_enabled)

# One column per model, aligned with the test rows.
preds_test = pd.DataFrame(
    np.column_stack([p1, p2, p3, p4]),
    index=X_test.index,
    columns=['xgb', 'lgb', 'lgbr', 'xgbr'],
)

# Row-wise mean of the four model columns (computed before `y` is added).
preds_test['y'] = preds_test.mean(axis=1)

preds_test.head()

Unnamed: 0,xgb,lgb,lgbr,xgbr,y
0,0.000182,0.0003190553,0.0001898591,0.000291,0.000245
1,0.000127,-0.0001754083,-2.010227e-05,-7.7e-05,-3.6e-05
2,8e-06,7.171223e-07,7.538494e-09,-5e-06,1e-06
3,4.6e-05,3.947058e-05,3.604451e-05,3.9e-05,4e-05
4,0.000478,0.0005573888,0.0004676852,0.000532,0.000509


In [34]:
# Write the blended prediction column, keeping the row index and a header line.
submission_path = '../data/output_13_lgb350reg_lgb700_xgb700_xgb700reg_with_log_diff_as_feats_with_param_tuning_entity_embeddings.csv'
preds_test['y'].to_csv(submission_path, index=True, header=True)

# Train final model with all the data

In [6]:
# Fold the validation split into the training data (features, target, weights).
# NOTE: the names are rebound to the enlarged objects, so re-running this cell
# appends the validation rows again; reload the data before re-executing.
X_train, y_train, weights_train = (
    pd.concat([train_part, val_part])
    for train_part, val_part in (
        (X_train, X_val),
        (y_train, y_val),
        (weights_train, weights_val),
    )
)

In [7]:
# The concatenated objects carry repeated index labels from the two splits;
# replace them with a fresh 0..n-1 range and discard the old labels.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
weights_train = weights_train.reset_index(drop=True)

In [11]:
# Refit the four models on train + validation data.
# WARNING: X_val / y_val were concatenated into X_train / y_train earlier in this
# section, so the "Test error" printed below is an in-sample figure and cannot
# be compared with the validation errors of the earlier runs.
enabled_vars = feats_with_embeddings

# Insertion order is preserved: lgboost_reg, lgboost, xgboost_reg, xgboost.
models = OrderedDict()
models['lgboost_reg'] = lgb.sklearn.LGBMRegressor(
    n_estimators=350, n_jobs=-1, max_depth=5, reg_lambda=10.0)
models['lgboost'] = lgb.sklearn.LGBMRegressor(n_estimators=700)
models['xgboost_reg'] = xgb.sklearn.XGBRegressor(
    n_estimators=700, n_jobs=-1, reg_lambda=10.0,
    grow_policy='lossguide', tree_method='hist', max_depth=5)
models['xgboost'] = xgb.sklearn.XGBRegressor(n_estimators=700, n_jobs=-1)

(df_preds_train, df_preds_test,
 train_error, test_error) = train_and_test_models(
    models,
    X_train.loc[:, enabled_vars], y_train,
    X_val.loc[:, enabled_vars], y_val,
    weights_train, weights_val,
)

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=5, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=350,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=10.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 8.175027528219318e-07 Test error: 7.997841743201003e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=700,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 6.5707960057007e-07 Test error: 6.425630901106511e-07 

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', 

In [15]:
# Refit of the four-model ensemble on train + validation data.
# WARNING: the validation rows are part of X_train at this point, so the
# "Test error" reported for X_val is measured on data the models were fitted on.
enabled_vars = feats_with_embeddings

xgb_default = xgb.sklearn.XGBRegressor(n_estimators=700, n_jobs=-1)
lgb_regularised = lgb.sklearn.LGBMRegressor(
    n_estimators=350, n_jobs=-1, max_depth=5, reg_lambda=10.0)
lgb_default = lgb.sklearn.LGBMRegressor(n_estimators=700)
xgb_regularised = xgb.sklearn.XGBRegressor(
    n_estimators=700, n_jobs=-1, reg_lambda=10.0,
    grow_policy='lossguide', tree_method='hist', max_depth=5)

models = OrderedDict([
    ('xgboost', xgb_default),
    ('lgboost_reg', lgb_regularised),
    ('lgboost', lgb_default),
    ('xgboost_reg', xgb_regularised),
])

train_features = X_train.loc[:, enabled_vars]
val_features = X_val.loc[:, enabled_vars]
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(
    models, train_features, y_train, val_features, y_val, weights_train, weights_val)

Fitting: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=700,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Train error: 8.82849457463085e-07 Test error: 8.598960994759067e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=5, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=350,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=10.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 8.224612228888496e-07 Test error: 8.045181874572624e-07 

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, c

In [13]:
# Model blending (equal-weight average of the four models) on the validation rows.
# WARNING: in this section the models were fitted on train + validation data,
# so this score is in-sample and not an estimate of generalisation error.
val_xgb = df_preds_test['xgboost_preds_test']
val_lgb = df_preds_test['lgboost_preds_test']
val_xgb_reg = df_preds_test['xgboost_reg_preds_test']
val_lgb_reg = df_preds_test['lgboost_reg_preds_test']

preds = (val_xgb + val_lgb + val_xgb_reg + val_lgb_reg) / 4
wMSE(preds=preds, y=y_val, weights=weights_val)

7.492324457561699e-07

In [14]:
# Model blending on the test set with the models refitted on all labelled data.
p1 = models['xgboost'].predict(X_test.loc[:, enabled_vars])
p2 = models['lgboost'].predict(X_test.loc[:, enabled_vars])
p3 = models['lgboost_reg'].predict(X_test.loc[:, enabled_vars])
p4 = models['xgboost_reg'].predict(X_test.loc[:, enabled_vars])

# Stack the four prediction vectors as columns, in the same order as the labels.
model_columns = ['xgb', 'lgb', 'lgbr', 'xgbr']
stacked_predictions = np.column_stack([p1, p2, p3, p4])
preds_test = pd.DataFrame(stacked_predictions, columns=model_columns, index=X_test.index)

# Final prediction: row-wise mean of the four model columns.
preds_test['y'] = preds_test.mean(axis=1)

preds_test.head()

Unnamed: 0,xgb,lgb,lgbr,xgbr,y
0,0.000221,0.000265,0.0002229333,0.000232,0.000235
1,0.0002,-0.000314,-0.0001486095,-0.000325,-0.000147
2,9e-06,3e-06,1.557063e-07,-4e-06,2e-06
3,4.1e-05,4e-05,3.327696e-05,3e-05,3.6e-05
4,0.00053,0.000519,0.0005088877,0.000511,0.000517


In [15]:
# Write the blended prediction column of the all-data models, keeping the row
# index and a header line.
submission_path = '../data/output_14_lgb350reg_lgb700_xgb700_xgb700reg_with_log_diff_as_feats_with_param_tuning_entity_embeddings_all_data_2.csv'
preds_test['y'].to_csv(submission_path, index=True, header=True)