<a href="https://colab.research.google.com/github/KoeusIss/orvp/blob/main/0x03_super_learner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Ensemble Learning
In statistics and machine learning, ensemble methods use multiple learning algorithms to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone. Unlike a statistical ensemble in statistical mechanics, which is usually infinite, a machine learning ensemble consists of only a concrete finite set of alternative models, but typically allows for much more flexible structure to exist among those alternatives.
In machine learnig we could find 3 most widely used ensembling technics:
* Bagging (Bootstrap aggregating)
* Boosting
* Stacking
### Bootstrap aggregating - Bagging
Bootstrap aggregating, often abbreviated as bagging, involves having each model in the ensemble vote with equal weight. *e.g*: Random forest algorithm combines random decision trees with bagging to achieve very high classification accuracy.
### Boosting
Boosting involves incrementally building an ensemble by training each new model instance to emphasize the training instances that previous models mis-classified. *e.g* Adaboost use a boosting technics of base estimators.
### Stacking
Stacking (sometimes called stacked generalization) involves training a learning algorithm to combine the predictions of several other learning algorithms. First, all of the other algorithms are trained using the available data, then a combiner algorithm is trained to make a final prediction using all the predictions of the other algorithms as additional inputs.

## Super learner
Super Learner is a targeted machine learning algorithm, It allows for utilization of ensemble machine learning algorithms in the fitting of probability distributions on observational data towards a target parameter defining a scientific question of interest.

In [5]:
# Connect with GoogleDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import seaborn as sns

# Utiilities
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

# models
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import lightgbm as lgb

# Constant
BASE_PATH = "/content/drive/MyDrive/DS/optiver-realized-volatility-prediction/"
N_FOLDS = 10
N_REPEAT = 3

In [14]:
# Load our datasets
df_train = pd.read_csv(BASE_PATH + "df_train_v0.csv")
df_train

Unnamed: 0,stock_id,time_id,target,log_return_1_realized_volatility_s600,log_return_2_realized_volatility_s600,wap_1_mean_s600,wap_1_std_s600,wap_2_mean_s600,wap_2_std_s600,bid_price1_mean_s600,bid_price1_amax_s600,bid_price2_mean_s600,bid_price2_amax_s600,ask_price1_mean_s600,ask_price1_amin_s600,ask_price2_mean_s600,ask_price2_amin_s600,bid_size1_mean_s600,bid_size1_sum_s600,bid_size2_mean_s600,bid_size2_sum_s600,ask_size1_mean_s600,ask_size1_sum_s600,ask_size2_mean_s600,ask_size2_sum_s600,bid_spread_mean_s600,bid_spread_std_s600,ask_spread_mean_s600,ask_spread_std_s600,spread_price_2_mean_s600,spread_price_2_std_s600,wap_balance_mean_s600,wap_balance_std_s600,total_volume_mean_s600,total_volume_std_s600,log_return_1_realized_volatility_s540,log_return_2_realized_volatility_s540,wap_1_mean_s540,wap_1_std_s540,wap_2_mean_s540,...,size_sum_s360,amount_mean_s360,amount_sum_s360,order_count_mean_s360,order_count_sum_s360,log_return_realized_volatility_s300,size_mean_s300,size_sum_s300,amount_mean_s300,amount_sum_s300,order_count_mean_s300,order_count_sum_s300,log_return_realized_volatility_s240,size_mean_s240,size_sum_s240,amount_mean_s240,amount_sum_s240,order_count_mean_s240,order_count_sum_s240,log_return_realized_volatility_s180,size_mean_s180,size_sum_s180,amount_mean_s180,amount_sum_s180,order_count_mean_s180,order_count_sum_s180,log_return_realized_volatility_s120,size_mean_s120,size_sum_s120,amount_mean_s120,amount_sum_s120,order_count_mean_s120,order_count_sum_s120,log_return_realized_volatility_s60,size_mean_s60,size_sum_s60,amount_mean_s60,amount_sum_s60,order_count_mean_s60,order_count_sum_s60
0,0,5,0.004136,0.004499,0.006999,1.003725,0.000693,1.003661,0.000781,1.003314,1.004267,1.003138,1.004215,1.004169,1.002301,1.004320,1.002353,78.264901,23636,80.880795,24426,74.579470,22523,89.771523,27111,0.000176,0.000162,-0.000151,0.000126,0.000588,0.000107,0.000388,0.000295,323.496689,138.101214,0.004186,0.006635,1.003875,0.000449,1.003822,...,1796.0,72.099460,1802.4865,2.400000,60.0,0.001308,75.571429,1587.0,75.83908,1592.62070,2.571429,54.0,0.001271,79.200000,1584.0,79.480330,1589.60670,2.600000,52.0,0.001121,65.312500,1045.0,65.523590,1048.37740,2.437500,39.0,0.000993,67.181818,739.0,67.390650,741.297100,2.454545,27.0,0.000953,88.375000,707.0,88.646120,709.168950,2.750000,22.0
1,0,11,0.001445,0.001204,0.002476,1.000239,0.000262,1.000206,0.000272,1.000011,1.000627,0.999869,1.000477,1.000406,0.999975,1.000541,1.000176,149.965000,29993,95.445000,19089,71.145000,14229,94.895000,18979,0.000142,0.000148,-0.000135,0.000065,0.000335,0.000100,0.000212,0.000155,411.450000,172.263581,0.001165,0.002468,1.000250,0.000261,1.000220,...,1122.0,53.445446,1122.3544,2.047619,43.0,0.000587,56.250000,900.0,56.27037,900.32590,2.250000,36.0,0.000557,62.357143,873.0,62.380142,873.32196,2.142857,30.0,0.000501,82.800000,828.0,82.830830,828.30830,2.200000,22.0,0.000496,103.250000,826.0,103.288475,826.307800,2.500000,20.0,0.000203,69.333333,208.0,69.376970,208.130900,2.666667,8.0
2,0,16,0.002168,0.002369,0.004801,0.999542,0.000864,0.999680,0.000862,0.999204,1.000120,0.999007,0.999928,0.999929,0.997678,1.000127,0.997966,96.132979,18073,114.526596,21531,131.037234,24635,74.654255,14035,0.000197,0.000170,-0.000198,0.000171,0.000560,0.000147,0.000331,0.000246,416.351064,138.433034,0.002218,0.004183,0.999424,0.000892,0.999594,...,1482.0,98.648780,1479.7317,3.000000,45.0,0.001137,99.083333,1189.0,98.90628,1186.87540,3.166667,38.0,0.001048,108.700000,1087.0,108.492630,1084.92640,3.400000,34.0,0.001048,120.555556,1085.0,120.325264,1082.92740,3.666667,33.0,0.000820,97.285714,681.0,97.070890,679.496200,2.428571,17.0,0.000325,105.500000,211.0,105.253490,210.506970,2.500000,5.0
3,0,31,0.002195,0.002574,0.003637,0.998832,0.000757,0.998633,0.000656,0.998445,0.999815,0.998255,0.999769,0.999305,0.998520,0.999413,0.998566,114.458333,13735,68.783333,8254,120.800000,14496,131.225000,15747,0.000190,0.000199,-0.000108,0.000091,0.000579,0.000183,0.000380,0.000248,435.266667,156.120334,0.002569,0.003608,0.998711,0.000652,0.998536,...,1561.0,155.883770,1558.8376,4.700000,47.0,0.001089,172.888889,1556.0,172.64950,1553.84560,5.111111,46.0,0.001050,162.625000,1301.0,162.396600,1299.17290,3.875000,31.0,0.000802,171.333333,514.0,170.987270,512.96180,3.666667,11.0,0.000327,254.500000,509.0,253.985080,507.970150,5.000000,10.0,0.000327,254.500000,509.0,253.985080,507.970150,5.000000,10.0
4,0,62,0.001747,0.001894,0.003257,0.999619,0.000258,0.999626,0.000317,0.999407,0.999790,0.999216,0.999650,0.999804,0.999464,0.999913,0.999557,119.823864,21089,87.840909,15460,88.477273,15572,47.079545,8286,0.000191,0.000083,-0.000109,0.000076,0.000349,0.000093,0.000254,0.000188,343.221591,158.054066,0.001853,0.003176,0.999596,0.000246,0.999612,...,1452.0,111.641450,1451.3389,4.615385,60.0,0.000453,110.818182,1219.0,110.76564,1218.42200,4.909091,54.0,0.000442,101.555556,914.0,101.507890,913.57100,5.222222,47.0,0.000395,27.000000,162.0,26.988255,161.92952,3.666667,22.0,0.000360,10.750000,43.0,10.746719,42.986877,3.500000,14.0,0.000335,17.500000,35.0,17.495443,34.990887,5.500000,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,0.003691,0.005876,0.999582,0.000486,0.999585,0.000613,0.998995,1.000033,0.998904,0.999935,0.999873,0.998759,1.000075,0.999216,144.106452,44673,108.396774,33603,80.038710,24812,73.503226,22786,0.000091,0.000115,-0.000202,0.000164,0.000586,0.000114,0.000361,0.000273,406.045161,156.439152,0.003600,0.005639,0.999535,0.000491,0.999529,...,1234.0,53.598595,1232.7677,2.347826,54.0,0.001451,44.222222,796.0,44.17879,795.21826,2.055556,37.0,0.001109,33.666667,505.0,33.628360,504.42540,1.800000,27.0,0.001067,38.692308,503.0,38.648224,502.42690,1.923077,25.0,0.001011,17.666667,159.0,17.646582,158.819230,1.666667,15.0,0.000956,25.200000,126.0,25.170599,125.852990,1.800000,9.0
428928,126,32753,0.003113,0.004104,0.004991,1.002476,0.001264,1.002602,0.001303,1.002018,1.005101,1.001891,1.005067,1.002725,1.001137,1.002868,1.001172,93.650224,20884,91.623318,20432,29.605381,6602,28.443946,6343,0.000126,0.000132,-0.000142,0.000136,0.000487,0.000116,0.000295,0.000228,243.322870,175.885338,0.004041,0.004832,1.002612,0.001227,1.002736,...,1171.0,51.067810,1174.5597,3.347826,77.0,0.001791,55.350000,1107.0,55.52052,1110.41040,3.550000,71.0,0.001431,49.062500,785.0,49.224250,787.58800,2.812500,45.0,0.001388,25.076923,326.0,25.184832,327.40280,1.769231,23.0,0.001239,32.100000,321.0,32.238636,322.386350,1.900000,19.0,0.001218,45.833333,275.0,46.039043,276.234250,2.166667,13.0
428929,126,32758,0.004070,0.003117,0.006020,1.001082,0.000466,1.000996,0.000599,1.000457,1.001629,1.000268,1.001580,1.001196,1.000296,1.001388,1.000346,181.261719,46403,113.703125,29108,22.246094,5695,30.882812,7906,0.000189,0.000198,-0.000192,0.000118,0.000560,0.000129,0.000394,0.000307,348.093750,169.658223,0.003104,0.005776,1.001083,0.000484,1.001038,...,2759.0,110.441284,2761.0322,2.600000,65.0,0.001580,114.583333,2750.0,114.66776,2752.02640,2.541667,61.0,0.001490,100.833333,1815.0,100.901710,1816.23080,2.166667,39.0,0.001354,74.000000,888.0,74.066895,888.80273,2.166667,26.0,0.001157,43.333333,130.0,43.410442,130.231320,2.666667,8.0,0.000742,64.500000,129.0,64.614975,129.229950,3.500000,7.0
428930,126,32763,0.003357,0.003661,0.005362,1.001809,0.000456,1.001790,0.000507,1.001611,1.002667,1.001468,1.002535,1.002142,1.001021,1.002276,1.001284,86.626566,34564,85.614035,34160,144.418546,57623,109.756892,43793,0.000143,0.000116,-0.000134,0.000100,0.000403,0.000102,0.000231,0.000177,426.416040,166.294850,0.003407,0.004881,1.001839,0.000452,1.001823,...,6183.0,121.464800,6194.7050,2.941176,150.0,0.001520,119.767442,5150.0,119.99315,5159.70560,2.813953,121.0,0.001393,120.055556,4322.0,120.269820,4329.71340,2.833333,102.0,0.001057,119.480000,2987.0,119.686580,2992.16430,2.920000,73.0,0.000759,91.714286,1284.0,91.850240,1285.903400,2.714286,38.0,0.000438,94.571429,662.0,94.687790,662.814500,3.000000,21.0


In [15]:
# Lets take only 20% of our data
subsampled_df = df_train.sample(frac=.2)
subsampled_df.shape

(85786, 393)

In [16]:
X = subsampled_df.drop("target", axis=1)
y = subsampled_df["target"]

In [19]:
# Scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = np.asarray(y)

In this case, we will use eight different algorithms with finetuned configuration.

The get_models() function below defines all of the models and returns them as a list.

In [36]:
# create a list of base-models
def get_models():
	models = list()
	models.append(LinearRegression())
	models.append(ElasticNet(alpha=1.0410316294176341e-05,fit_intercept=False,l1_ratio=0.12336681374952599,max_iter=1500,tol=0.0007698356130986729))
	models.append(DecisionTreeRegressor(max_depth=7,min_samples_leaf=10,max_leaf_nodes=70,min_weight_fraction_leaf=0,max_features='auto',splitter='best'))
	models.append(MLPRegressor(alpha=7.630002490547322e-06,early_stopping=True,hidden_layer_sizes=16,tol=0.00011614929178984148,warm_start=True))
	models.append(lgb.LGBMRegressor(num_leaves=127,min_child_samples=445,learning_rate=0.02591748699123639,reg_alpha=0.0,reg_lambda=0.06601415645589061,subsample=0.5590865469154307,colsample_by_tree=0.7064428383863258,is_unbalance=False))
	models.append(BaggingRegressor(n_estimators=21))
	models.append(RandomForestRegressor(n_estimators=22))
	models.append(ExtraTreesRegressor(n_estimators=23))
	return models

In [37]:
get_models()

[LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 ElasticNet(alpha=1.0410316294176341e-05, copy_X=True, fit_intercept=False,
            l1_ratio=0.12336681374952599, max_iter=1500, normalize=False,
            positive=False, precompute=False, random_state=None,
            selection='cyclic', tol=0.0007698356130986729, warm_start=False),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=7,
                       max_features='auto', max_leaf_nodes=70,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0, presort='deprecated',
                       random_state=None, splitter='best'),
 MLPRegressor(activation='relu', alpha=7.630002490547322e-06, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=16, learning_rate='constant',
 

In [38]:
# get OOF prediction
def get_out_of_fold_predictions(X, y, models):
    meta_X, meta_y = list(), list()
    kfold = KFold(n_splits=N_FOLDS, shuffle=True)

    for train_ix, test_ix in kfold.split(X):
        fold_yhats = list()
        
        train_X, test_X = X[train_ix], X[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]
        meta_y.extend(test_y)

        for model in models:
            model.fit(train_X, train_y)
            yhat = model.predict(test_X)
            fold_yhats.append(yhat.reshape(len(yhat),1))

        meta_X.append(np.hstack(fold_yhats))
    return np.vstack(meta_X), np.asarray(meta_y)

In [39]:
# Fit the base model (first level model) with whole dataset
def fit_base_models(X, y, models):
    for model in models:
        model.fit(X, y)

In [40]:
# Fit the meta model (second level)
def fit_meta_model(X, y):
    model = LinearRegression()
    model.fit(X, y)
    return model

In [41]:
# Compute the Root Mean Squate Percentage Error
# An error functionn provided within the competition
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

In [42]:
def evaluate_models(X, y, models):
    for model in models:
        yhat = model.predict(X)
        _rmspe = rmspe(y, yhat)
        print('%s: RMSPE %.5f' % (model.__class__.__name__, _rmspe))

In [43]:
def super_learner_predictions(X, models, meta_model):
    meta_X = list()
    for model in models:
        yhat = model.predict(X)
        meta_X.append(yhat.reshape(len(yhat),1))
    meta_X = np.hstack(meta_X)

    return meta_model.predict(meta_X)

In [None]:
# Super"Learner" Call
fe, fe_test, label, label_test = train_test_split(X, y, test_size=.2)
models = get_models()

meta_X, meta_y = get_out_of_fold_predictions(fe, label, models)
fit_base_models(fe, label, models)
meta_model = fit_meta_model(meta_X, meta_y)
evaluate_models(fe_test, label_test, models)
yhat = super_learner_predictions(fe_test, models, meta_model)
print('Super Learner: RMSPE %.5f' % rmspe(label_test, yhat))