In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
import xgboost as xgb
import lightgbm as lgb
sns.set_context('poster')

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split

# Importing from my own modules
import sys
sys.path.append('../financial_forecasting/')
from utils import load_data, wMSE, train_and_test_models
from preprocessing import Imputer, LogTransformer, MeanEncoder, compute_combined_variable, TreeBinner

# Load data

In [None]:
df_train, df_test = load_data()

In [None]:
X = df_train.drop(labels=['y','Weight'], axis=1)
y = df_train.y
weights = df_train.Weight
X_test = df_test

In [4]:
np.random.seed=42
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.30)

In [5]:
weights_train = weights[X_train.index]
weights_val = weights[X_val.index]

In [6]:
print(X_train.shape)
print(y_train.shape)

print(X_val.shape)
print(y_val.shape)

print(X_test.shape)

(436671, 14)
(436671,)
(187146, 14)
(187146,)
(640430, 14)


# Apply pre-processing

In [7]:
# Missing value imputation
imputer = Imputer()
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

In [8]:
# Log transformation
features_to_log_trans = ['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6']
log_transformer = LogTransformer()
log_transformer.fit(X_train, features=features_to_log_trans)
X_train = log_transformer.transform(X_train)
X_val = log_transformer.transform(X_val)
X_test = log_transformer.transform(X_test)

  df[f + '_log10'] = np.log10(df[f])
  df[f + '_log10'] = np.log10(df[f])


In [26]:
print(len(X))
print(len(X_train)+len(X_val)+len(X_test))

1264247
1264247


In [36]:
# Compute log differences between current day and previous day for each stock
# Need to combine train, validation, and test for this
X_train['is_train_val_test'] = 0
X_val['is_train_val_test'] = 1
X_test['is_train_val_test'] = 2

X = pd.concat([X_train, X_val, X_test])
X = X.reset_index(drop=True)
X = compute_combined_variable(X, var1='Day', var2='Stock')
features = [feat + '_log10' for feat in df_train.columns if 'x' in feat]
for feat in features:
    x = X.groupby(['Day','Stock'])[feat].mean().unstack().diff().fillna(0).stack().reset_index()
    x = compute_combined_variable(x, var1='Day', var2='Stock')
    log_diff_dict = x.set_index('Day_Stock')[0].to_dict()
    X[feat + '_diff'] = X.loc[:, 'Day_Stock'].map(log_diff_dict)

In [37]:
# Split back into train, validation, and test
X_train = X[X.is_train_val_test == 0]
X_val = X[X.is_train_val_test == 1]
X_test = X[X.is_train_val_test == 2]

In [39]:
# Tree based binning
features_to_bin = features_to_log_trans.copy()
tree_binner = TreeBinner()
tree_binner.fit(X_train, y_train, weights_train, features=features_to_bin)
X_train = tree_binner.transform(X_train)
X_val = tree_binner.transform(X_val)
X_test = tree_binner.transform(X_test)

In [41]:
# Mean value encoding
features_to_encode = ['Market', 'Day', 'Stock']
encoder = MeanEncoder()
encoder.fit(X_train, features=features_to_encode, target=y_train)
X_train = encoder.transform(X_train)
X_val = encoder.transform(X_val)
X_test = encoder.transform(X_test)

Detected unseen values for encoding for feature Stock: {33.0, 111.0, 139.0, 387.0, 561.0, 1396.0, 1755.0, 1779.0, 1945.0, 1946.0, 1970.0, 2076.0, 2992.0}
Detected unseen values for encoding for feature Day: {1.0, 2.0, 5.0, 6.0, 7.0, 8.0, 9.0, 12.0, 13, 14, 15, 16, 19.0, 20.0, 21.0, 22.0, 23, 26, 27, 28, 29, 30.0, 33, 34, 35, 36, 37.0, 40, 41.0, 42, 43.0, 44.0, 47.0, 48.0, 49.0, 50, 51, 54.0, 55.0, 56.0, 57.0, 58.0, 61.0, 62.0, 63, 64.0, 65.0, 68.0, 69, 70, 71.0, 72, 75, 76, 77, 78.0, 79, 82.0, 83, 84, 85.0, 86.0, 89.0, 90, 91, 92.0, 93.0, 96, 97.0, 98.0, 99, 100, 103, 104.0, 105.0, 106, 110.0, 111.0, 112, 113.0, 114.0, 117.0, 118, 119, 120, 121, 124.0, 125.0, 126, 127.0, 128.0, 131.0, 132.0, 133, 134.0, 135.0, 138.0, 139, 140.0, 141, 142, 145.0, 146, 147.0, 148.0, 149, 152, 153.0, 154.0, 155, 156.0, 159, 160.0, 161.0, 162, 163, 166.0, 167.0, 168.0, 169.0, 170, 173.0, 174.0, 175, 176, 177, 180.0, 181, 182, 183.0, 184.0, 187.0, 188, 189, 190.0, 191, 194, 195.0, 196, 197.0, 198.0, 201.0, 

In [55]:
# Drop columns no longer needed
X_train.drop(labels=['is_train_val_test', 'Day_Stock'], axis=1, inplace=True)
X_val.drop(labels=['is_train_val_test', 'Day_Stock'], axis=1, inplace=True)
X_test.drop(labels=['is_train_val_test', 'Day_Stock'], axis=1, inplace=True)

# Output processed data

In [70]:
X_train.to_csv('../data/preprocessed/train.csv', index=False)
X_val.to_csv('../data/preprocessed/validation.csv', index=False)
X_test.to_csv('../data/preprocessed/test.csv', index=False)

weights_train.to_csv('../data/preprocessed/train_weights.csv', index=False)
weights_val.to_csv('../data/preprocessed/validation_weights.csv', index=False)

y_train.to_csv('../data/preprocessed/train_target.csv', index=False)
y_val.to_csv('../data/preprocessed/validation_target.csv', index=False)


In [57]:
# All the features
enabled_vars_trees = X_train.columns

models = OrderedDict([
                      ('lgboost1', lgb.LGBMRegressor(n_estimators=100, n_jobs=-1, learning_rate=0.1)), 
                    ])

# Remove zero feature importance features
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(models, 
                                                         X_train.loc[:,enabled_vars_trees], y_train, 
                                                         X_val.loc[:,enabled_vars_trees], y_val, 
                                                         weights_train, weights_val)

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 0.5679668711518437 Test error: 0.6316990201723655 



In [61]:
feature_importance = pd.Series(models['lgboost1'].feature_importances_, index=enabled_vars_trees)
feature_importance = np.abs(feature_importance) / np.abs(feature_importance).sum()
print(feature_importance.sort_values(ascending=False))
non_zero_feature_importance = set(feature_importance[feature_importance > 1e-5].index)

x4                     0.072667
x0                     0.056667
x3E                    0.048000
x3D                    0.043000
Market                 0.038333
x3A_log10              0.035000
Day                    0.034333
x3D_log10              0.033667
x0_log10_diff          0.029333
Stock                  0.029333
x2                     0.029000
x5                     0.028667
x3A_log10_diff         0.028667
x3A                    0.026667
x1                     0.025000
Day_mean_encoded       0.024667
x3E_log10              0.024333
Market_mean_encoded    0.023667
x0_log10               0.023333
x2_log10_diff          0.022333
x4_log10_diff          0.020667
x5_log10_diff          0.020667
x5_log10               0.020667
x1_log10_diff          0.019667
x6                     0.019667
x3E_log10_diff         0.018667
Stock_mean_encoded     0.018333
x3D_log10_diff         0.017667
x6_log10               0.016667
x4_log10               0.016333
                         ...   
x4_binne

In [63]:
# All the features
enabled_vars_trees = non_zero_feature_importance

models = OrderedDict([
                      ('lgboost1', lgb.LGBMRegressor(n_estimators=100, n_jobs=-1, learning_rate=0.1)), 
                    ])

# Remove zero feature importance features
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(models, 
                                                         X_train.loc[:,enabled_vars_trees], y_train, 
                                                         X_val.loc[:,enabled_vars_trees], y_val, 
                                                         weights_train, weights_val)

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 0.5679668711518502 Test error: 0.6316551247294814 



In [64]:
best_feats_so_far = ['Day',
 'Day_mean_encoded',
 'Market',
 'Market_mean_encoded',
 'Stock',
 'Stock_mean_encoded',
 'x0',
 'x0_log10',
 'x1',
 'x1_binned',
 'x2_log10',
 'x3A',
 'x3A_log10',
 'x3B',
 'x3C',
 'x3C_log10',
 'x3D',
 'x3D_log10',
 'x3E',
 'x3E_binned',
 'x3E_log10',
 'x4',
 'x4_binned',
 'x4_log10',
 'x5',
 'x5_log10',
 'x6',
 'x6_binned']

In [69]:
set(best_feats_so_far).issubset(set(non_zero_feature_importance))

True

In [65]:
# Best features found by hand
enabled_vars_trees = best_feats_so_far

models = OrderedDict([
                      ('lgboost1', lgb.LGBMRegressor(n_estimators=100, n_jobs=-1, learning_rate=0.1)), 
                    ])

# Remove zero feature importance features
df_preds_train, df_preds_test, train_error, test_error = train_and_test_models(models, 
                                                         X_train.loc[:,enabled_vars_trees], y_train, 
                                                         X_val.loc[:,enabled_vars_trees], y_val, 
                                                         weights_train, weights_val)

Fitting: LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)
Train error: 0.5812753524176768 Test error: 0.6290637746557141 



# Modelling

## Training and testing

### Null model

In [8]:
class NullModel():
    def __init__(self):
        self.stock_mean = None
    
    def fit(self, df):
        stock_mean = df.groupby('Stock')['y'].mean()
        self.stock_mean = stock_mean
        
    def predict(self, df):
        preds = df['Stock'].map(self.stock_mean).rename('y')
        preds.fillna(0)
        return preds

In [9]:
N = 436671
X_train_ = X_train.iloc[:N, :]
y_train_ =  y_train.iloc[:N]
weights_train_ = weights_train.iloc[:N]
df = pd.concat([X_train_, y_train_], axis=1)

clfs = [NullModel()]

for clf in clfs:
    # Training
    clf.fit(df)
        
    # Testing
    preds_train = clf.predict(X_train_)
    preds_val = clf.predict(X_val)
    preds_test = clf.predict(X_test)
    
    train_error = wMSE(preds=preds_train, y=y_train_, weights=weights_train_)
    val_error =  wMSE(preds=preds_val, y=y_val, weights=weights_val)
    scale = len(df_test)
    print('Train error: {} Test error: {} \n'.format(train_error * scale, val_error * scale))

Train error: 0.8252996810408122 Test error: 0.8398338690379339 



### Linear models

In [10]:
N = 436671
X_train_ = X_train_linear.iloc[:N, :]
y_train_ =  y_train.iloc[:N]
weights_train_ = weights_train.iloc[:N]

clfs = {'LinearRegression':LinearRegression(), 
        'L1Regression':Lasso(),
        'L2Regression':Ridge(alpha=1)}

preds_v = pd.DataFrame()
preds_t = pd.DataFrame()

for clf_name, clf in clfs.items():
    # Training
    print('Fitting: {}'.format(clf))
    try:
        clf.fit(X_train_, y_train_, sample_weight=weights_train_.values)
    except TypeError:
        print('{} does not accept sample weights'.format(clf))
        clf.fit(X_train_, y_train_)
        
    # Testing
    preds_train = clf.predict(X_train_)
    preds_val = clf.predict(X_val_linear)
    preds_test = clf.predict(X_test_linear)
    
    train_error = wMSE(preds=preds_train, y=y_train_, weights=weights_train_)
    val_error =  wMSE(preds=preds_val, y=y_val, weights=weights_val)
    scale = len(df_test)
    print('Train error: {} Test error: {} \n'.format(train_error * scale, val_error * scale))
    
    # Append test predictions to a dataframe
    data = {clf_name + '_preds_val': preds_val}
    df_preds_val = pd.DataFrame(data=data, index=X_val_linear.index)
    preds_v = pd.concat([preds_v, df_preds_val], axis=1)
    
    # Append test predictions to a dataframe
    data = {clf_name + '_preds_test': preds_test}
    df_preds_test = pd.DataFrame(data=data, index=X_test_linear.index)
    preds_t = pd.concat([preds_t, df_preds_test], axis=1)

Fitting: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False) does not accept sample weights
Train error: 1.0626090284949343 Test error: 1.050829325626338 

Fitting: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Train error: 0.7545601464126234 Test error: 0.7835720241122516 

Fitting: Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Train error: 0.7615218321457947 Test error: 0.7743314016629029 



### Tree based models

In [11]:
N = 436671
X_train_ = X_train.iloc[:N, :]
y_train_ =  y_train.iloc[:N]
weights_train_ = weights_train.iloc[:N]

clfs = {'RF': RandomForestRegressor(n_estimators=100),
        'GBM': GradientBoostingRegressor()}
for clf_name, clf in clfs.items():
    # Training
    print('Fitting: {}'.format(clf))
    try:
        clf.fit(X_train_, y_train_, sample_weight=weights_train_.values)
    except TypeError:
        print('{} does not accept sample weights'.format(clf))
        clf.fit(X_train_, y_train_)
        
    # Testing
    preds_train = clf.predict(X_train_)
    preds_val = clf.predict(X_val)
    preds_test = clf.predict(X_test)
    
    train_error = wMSE(preds=preds_train, y=y_train_, weights=weights_train_)
    val_error =  wMSE(preds=preds_val, y=y_val, weights=weights_val)
    scale = len(df_test)
    print('Train error: {} Test error: {} \n'.format(train_error * scale, val_error * scale))
    
    # Append test predictions to a dataframe
    data = {clf_name + '_preds_val': preds_val}
    df_preds_val = pd.DataFrame(data=data, index=X_val.index)
    preds_v = pd.concat([preds_v, df_preds_val], axis=1)
    
    # Append test predictions to a dataframe
    data = {clf_name + '_preds_test': preds_test}
    df_preds_test = pd.DataFrame(data=data, index=X_test.index)
    preds_t = pd.concat([preds_t, df_preds_test], axis=1)

Fitting: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Train error: 0.6508472707408143 Test error: 0.807733522967682 

Fitting: GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
Train error: 0.9284558477597071 Test error: 0.9199019124950226 



### Model Stacking

In [16]:
preds_v['simple_average'] = (preds_v.L2Regression_preds_val + preds_v.RF_preds_val + preds_v.LinearRegression_preds_val)/3
preds_t['simple_average'] = (preds_t.L2Regression_preds_test + preds_t.RF_preds_test + preds_t.LinearRegression_preds_test)/3

In [17]:
error = wMSE(preds=preds_v.simple_average, y=y_val, weights=weights_val)
print(error*scale)

0.7495505543625007


In [18]:
preds_t['simple_average'].rename('y').to_csv('../data/output.csv', header=True)

In [None]:
# Learning curve
train_errors = []
test_errors = []
data_size = []
for N in range(1000,42000,5000):
    X_train_ = X_train.iloc[:N, :]
    y_train_ =  y_train.iloc[:N]
    weights_train_ = weights_train.iloc[:N]

    clf = RandomForestRegressor(n_estimators=100)
    clf.fit(X_train_, y_train_, sample_weight=weights_train_.values)
    
    preds_train = clf.predict(X_train_)
    preds_test = clf.predict(X_test)
    
    train_error = wMSE(preds=preds_train, y=y_train_, weights=weights_train_)
    test_error =  wMSE(preds=preds_test, y=y_test, weights=weights_test)
    
    print('Data size: {} Train error: {} Test error: {}'.format(N,train_error * 1e6, test_error*1e6))
    
    train_errors.append(train_error)
    test_errors.append(test_error)
    data_size.append(N)
    
plt.plot(data_size, train_errors)
plt.plot(data_size, test_errors)

In [None]:
# Sanity check
df_train_error = pd.DataFrame({'preds_train':preds_train, 'y_train':y_train, 'weights_train':weights_train})
df_train_error['Diff'] = df_train_error.preds_train - df_train_error.y_train
df_train_error['wMSE'] = np.square(df_train_error.Diff) * df_train_error.weights_train
print(df_train_error.wMSE.sum() / len(df_train_error))

## Feature importance

In [None]:
feat_importance = pd.Series(dict(zip(X_train.columns, clf.feature_importances_))).sort_values(ascending=False)
feat_importance[feat_importance > 1e-4].plot(kind='barh')

In [None]:
clf.get_params()

# Output 

In [None]:
preds_t['simple_average'].rename('y').to_csv('../data/output.csv', header=True)