In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from helper_functions import *
import os

In [2]:
# Load the prediction tables produced by each base model. Every file is read
# with the same parser options, so they are spelled out once.
csv_options = dict(low_memory=False, header=0)

LR_data = pd.read_csv("./LR_Model_Predictions_(2016-2018).csv", **csv_options)
LGBM_data = pd.read_csv("./LGBM_Predictions.csv", **csv_options)
LSTM_data = pd.read_csv("./LSTM_predicted.csv", **csv_options)
RF_data = pd.read_csv("./RF_predictions.csv", **csv_options)
SARIMA_data = pd.read_csv("./SARIMA_Predictions.csv", **csv_options)

In [3]:
# Standardize format
#
# Give every source a "DATE" key column and make the per-model columns
# distinguishable before merging. Each step is written so that re-running this
# cell on already-standardized frames is a no-op (the previous in-place
# version appended "_SARIMA" a second time and re-sliced dashed dates).

# SARIMA: tag every prediction column with the model name; the unnamed first
# column holds the dates and becomes the "DATE" key.
SARIMA_data = SARIMA_data.rename(columns={
    col: "DATE" if col in ("Unnamed: 0", "DATE") else col + "_SARIMA"
    for col in SARIMA_data.columns
    if not col.endswith("_SARIMA")
})

# LGBM: only the key column needs renaming.
LGBM_data = LGBM_data.rename(columns={"dates": "DATE"})

# LR: dates arrive in compact YYYYMMDD form; rewrite them as YYYY-MM-DD.
# Dashes are stripped first so an already-converted column survives a re-run.
lr_dates = LR_data["DATE"].astype(str).str.replace("-", "", regex=False)
LR_data["DATE"] = lr_dates.str[0:4] + "-" + lr_dates.str[4:6] + "-" + lr_dates.str[6:8]

# Left-join every model's predictions onto the LGBM dates. Suffixes are only
# applied to overlapping column names: LGBM/LSTM are disambiguated in the
# first join, RF/LR in the third, and SARIMA columns were tagged above.
result = (
    LGBM_data.set_index("DATE")
    .join(LSTM_data.set_index("DATE"), on="DATE", lsuffix="_LGBM", rsuffix="_LSTM")
    .join(RF_data.set_index("DATE"), on="DATE")
    .join(LR_data.set_index("DATE"), on="DATE", lsuffix="_RF", rsuffix="_LR")
    .join(SARIMA_data.set_index("DATE"), on="DATE")
    .reset_index()
)

# Back-fill gaps with the next available value in each column.
# NOTE(review): this runs before the train/test split, so a gap at the end of
# the training rows would be filled from test-period values - confirm that is
# acceptable.
result = result.bfill()
result.head()

Unnamed: 0,DATE,F_AD_LGBM,F_AE_LGBM,F_AH_LGBM,F_AX_LGBM,F_BC_LGBM,F_BG_LGBM,F_BO_LGBM,F_BP_LGBM,F_C_LGBM,...,F_US_SARIMA,F_UZ_SARIMA,F_VF_SARIMA,F_VT_SARIMA,F_VW_SARIMA,F_VX_SARIMA,F_W_SARIMA,F_XX_SARIMA,F_YM_SARIMA,F_ZQ_SARIMA
0,2016-01-01,71296.911747,85901.674488,7829.242385,255484.549448,38365.514703,34013.656915,18215.556032,92581.499364,17829.692822,...,153750.0,111510.548466,128006.370234,147888.184302,112052.738638,18504.964587,23515.010887,30928.682762,86795.103351,414913.543865
1,2016-01-04,71296.911747,85901.674488,7829.242385,255484.549448,38365.514703,34013.656915,18215.556032,92581.499364,17829.692822,...,153750.0,111509.301163,128012.740785,147902.675392,112054.282462,18489.671922,23513.234782,30921.906748,86839.477543,414918.4413
2,2016-01-05,71296.911747,85901.674488,7829.242385,255484.549448,38365.514703,34013.656915,18215.556032,91997.11849,17829.692822,...,153750.0,111510.124515,128019.111653,147920.889259,112057.243954,18477.996683,23497.890472,30917.849244,86839.477543,414923.338792
3,2016-01-06,71050.233873,86663.42473,7829.242385,255484.549448,37032.812581,33906.859206,18056.948621,91401.078401,17792.318272,...,153750.0,111509.399341,128025.482838,147933.574103,112059.640625,18469.081644,23489.864373,30915.419424,86839.477543,414928.236342
4,2016-01-07,70580.112371,86786.8135,7829.242385,254367.179741,35499.680274,32161.040913,18000.390457,91797.971859,17730.697397,...,153750.0,111510.077885,128031.85434,147946.260034,112062.037346,18462.273359,23499.314896,30913.964274,86839.477543,414933.13395


In [4]:
# Split the merged predictions by row label: labels 0..LAST_TRAIN_ROW are the
# training rows, everything after is the test period (label 521 is 2018-01-01
# in the preview of all_x_test shown further down).
LAST_TRAIN_ROW = 520
all_x_train = result.loc[:LAST_TRAIN_ROW, :]
all_x_test = result.loc[LAST_TRAIN_ROW + 1:, :]

# Targets for the two periods, back-filled so no leading gaps remain.
# NOTE(review): read_pickle can execute arbitrary code while loading - only
# use pickle files from a trusted source.
# NOTE(review): the training targets come from a file named
# "data_base_test.pkl" - confirm this is the intended file.
all_y_train = pd.read_pickle("data_base_test.pkl").bfill()
all_y_test = pd.read_pickle("data_stack_test.pkl").bfill()

In [5]:
# Tickers of the futures contracts for which a stacked model is fitted.
futures = [
    'F_AD', 'F_AE', 'F_AH', 'F_AX', 'F_BC', 'F_BG', 'F_BO', 'F_BP',
    'F_C', 'F_CA', 'F_CC', 'F_CD', 'F_CF', 'F_CL', 'F_CT', 'F_DL',
    'F_DM', 'F_DT', 'F_DX', 'F_DZ', 'F_EB', 'F_EC', 'F_ED', 'F_ES',
    'F_F', 'F_FB', 'F_FC', 'F_FL', 'F_FM', 'F_FP', 'F_FV', 'F_FY',
    'F_GC', 'F_GD', 'F_GS', 'F_GX', 'F_HG', 'F_HO', 'F_HP', 'F_JY',
    'F_KC', 'F_LB', 'F_LC', 'F_LN', 'F_LQ', 'F_LR', 'F_LU', 'F_LX',
    'F_MD', 'F_MP', 'F_ND', 'F_NG', 'F_NQ', 'F_NR', 'F_NY', 'F_O',
    'F_OJ', 'F_PA', 'F_PL', 'F_PQ', 'F_RB', 'F_RF', 'F_RP', 'F_RR',
    'F_RU', 'F_RY', 'F_S', 'F_SB', 'F_SF', 'F_SH', 'F_SI', 'F_SM',
    'F_SS', 'F_SX', 'F_TR', 'F_TU', 'F_TY', 'F_UB', 'F_US', 'F_UZ',
    'F_VX', 'F_W', 'F_XX', 'F_YM', 'F_ZQ',
]

In [6]:
# Empty table for the stacked predictions: one row per test-period date
# (index named "DATE"), one column per future, every cell starting as NaN.
test_dates = pd.Index(all_x_test["DATE"].values, name="DATE")
final_preds = pd.DataFrame(index=test_dates, columns=futures)
final_preds.head()

Unnamed: 0_level_0,F_AD,F_AE,F_AH,F_AX,F_BC,F_BG,F_BO,F_BP,F_C,F_CA,...,F_TU,F_TY,F_UB,F_US,F_UZ,F_VX,F_W,F_XX,F_YM,F_ZQ
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,,,,,,,,,,,...,,,,,,,,,,
2018-01-02,,,,,,,,,,,...,,,,,,,,,,
2018-01-03,,,,,,,,,,,...,,,,,,,,,,
2018-01-04,,,,,,,,,,,...,,,,,,,,,,
2018-01-05,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Fit one linear-regression stacker per future.
#
# For every ticker the five base-model predictions are the features and the
# CLOSE price is the target. The test-period predictions are written into
# `final_preds`, and the fitted weights, intercept and test MAPE are collected
# into `result` (one row per future, in `futures` order).
#
# NOTE: `result` is rebound here; before this cell it held the merged
# base-model predictions, so the split cell above cannot be re-run afterwards
# without first re-running the merge cell.
model_names = ['LGBM', 'LSTM', 'RF', 'LR', 'SARIMA']
summary_columns = model_names + ['Intercept', 'MAPE']
coef_rows = []

for fut in futures:
    # Targets are kept as single-column frames so that predictions and
    # coefficients come back 2-D, as the code below expects.
    y_train = pd.DataFrame(all_y_train[fut]['CLOSE']).ffill()
    y_test = pd.DataFrame(all_y_test[fut]['CLOSE']).ffill()

    # Select this future's base-model columns. Matching on the prefix (rather
    # than anywhere in the name) states the intent exactly.
    prefix = fut + '_'
    feature_list = [col for col in all_x_train.columns if col.startswith(prefix)]
    # The coefficient labels below rely on exactly one column per base model,
    # in merge order; fail early with a clear message otherwise.
    if len(feature_list) != len(model_names):
        raise ValueError(
            "Expected %d feature columns for %s, found %d: %s"
            % (len(model_names), fut, len(feature_list), feature_list)
        )

    x_train = all_x_train.loc[:, feature_list].ffill()
    x_test = all_x_test.loc[:, feature_list].ffill()

    lr = LinearRegression()
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)

    # Store predictions: y_pred has one column, aligned row-for-row with
    # all_x_test and therefore with final_preds.
    final_preds[fut] = y_pred[:, 0]

    coefficient = lr.coef_
    intercept = lr.intercept_
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(coefficient)
    print(intercept)
    print(mape)

    row = pd.DataFrame(coefficient, columns=model_names)
    row['Intercept'] = intercept
    row['MAPE'] = mape.values
    coef_rows.append(row)

# Assemble the summary once instead of appending inside the loop
# (DataFrame.append was removed in pandas 2.0 and copies the frame each time).
if coef_rows:
    result = pd.concat(coef_rows, ignore_index=True)
else:
    result = pd.DataFrame(columns=summary_columns)

  linalg.lstsq(X, y)


[[-0.03682807  0.05727466  0.23162801  0.82009922 -0.07818566]]
[492.6904608]
CLOSE    0.446288
dtype: float64
[[-0.039895    0.10673506  0.2270541   0.77047901 -0.06688123]]
[346.02242603]
CLOSE    0.820527
dtype: float64
[[ 0.01983935  0.06358075 -0.00695226  0.99530852 -0.04633033]]
[-202.28785557]
CLOSE    0.57776
dtype: float64
[[-0.18452443  0.10194569  0.16112307  1.01101421 -0.06999362]]
[-4852.24303431]
CLOSE    0.918579
dtype: float64
[[-0.097693    0.16900353  0.19261745  0.83705107 -0.09777513]]
[-46.76256557]
CLOSE    1.970718
dtype: float64
[[-0.14776151  0.09841957  0.08582999  1.03870417 -0.08004489]]
[268.50405487]
CLOSE    1.418103
dtype: float64
[[ 0.05472117  0.06701178  0.1123763   0.83803616 -0.07940823]]
[194.50553379]
CLOSE    0.939081
dtype: float64
[[-0.03343994  0.12351122 -0.20059081  1.00105424 -0.05918052]]
[15135.91472722]
CLOSE    0.484992
dtype: float64
[[0.00976731 0.03991059 0.10343122 0.82081614 0.01429468]]
[193.63993277]
CLOSE    0.833863
dtype: fl

In [8]:
# Persist the stacked predictions, one column per future.
# NOTE(review): index=False drops the DATE index, so the file has no date
# column - confirm downstream readers do not need it.
final_preds.to_csv(path_or_buf="Final_Predictions.csv", index=False)

In [9]:
# Preview the first rows of the test-period feature table.
all_x_test.head(n=5)

Unnamed: 0,DATE,F_AD_LGBM,F_AE_LGBM,F_AH_LGBM,F_AX_LGBM,F_BC_LGBM,F_BG_LGBM,F_BO_LGBM,F_BP_LGBM,F_C_LGBM,...,F_US_SARIMA,F_UZ_SARIMA,F_VF_SARIMA,F_VT_SARIMA,F_VW_SARIMA,F_VX_SARIMA,F_W_SARIMA,F_XX_SARIMA,F_YM_SARIMA,F_ZQ_SARIMA
521,2018-01-01,78333.293563,108571.222862,8869.279471,300757.296494,66456.907141,60124.632877,20154.452512,87035.878729,17730.697397,...,152829.056052,111965.47177,128042.74281,149521.873891,112275.664092,11709.052523,21329.00409,31327.815586,123495.950417,410570.44799
522,2018-01-02,78333.293563,108571.222862,8869.279471,300757.296494,66456.907141,60124.632877,20154.452512,87035.878729,17730.697397,...,152829.056052,111965.232796,128049.114691,149296.046776,112280.520332,11871.72635,21326.882213,31282.937254,123841.3635,410574.661099
523,2018-01-03,78333.293563,108571.222862,8869.279471,300757.296494,66456.907141,60124.632877,20154.452512,87035.878729,17730.697397,...,152829.056052,111967.366831,128055.486889,149179.860259,112282.974566,11999.133222,21360.10873,31255.354084,123841.3635,410578.874251
524,2018-01-04,78344.670161,109788.33242,8869.279471,300757.296494,67377.920132,61287.502726,20211.633309,87035.878729,17792.318272,...,152829.056052,111964.861935,128061.859404,149277.234753,112285.428853,12098.643035,21383.326464,31238.393334,123841.3635,410583.087447
525,2018-01-05,78657.706831,110512.910287,8869.279471,300757.296494,67433.674507,61366.069844,20307.890995,87035.878729,17437.864602,...,152160.276611,111947.884533,128066.380511,149470.539853,112282.004619,10948.925622,21700.0,31636.431942,124503.475643,410499.481935


In [10]:
# Display the per-future stacking summary built in the fitting loop: one row
# per future with the five base-model coefficients, the intercept and the MAPE.
result

Unnamed: 0,LGBM,LSTM,RF,LR,SARIMA,Intercept,MAPE
0,-0.036828,0.057275,0.231628,0.820099,-0.078186,492.690461,0.446288
1,-0.039895,0.106735,0.227054,0.770479,-0.066881,346.022426,0.820527
2,0.019839,0.063581,-0.006952,0.995309,-0.046330,-202.287856,0.577760
3,-0.184524,0.101946,0.161123,1.011014,-0.069994,-4852.243034,0.918579
4,-0.097693,0.169004,0.192617,0.837051,-0.097775,-46.762566,1.970718
5,-0.147762,0.098420,0.085830,1.038704,-0.080045,268.504055,1.418103
6,0.054721,0.067012,0.112376,0.838036,-0.079408,194.505534,0.939081
7,-0.033440,0.123511,-0.200591,1.001054,-0.059181,15135.914727,0.484992
8,0.009767,0.039911,0.103431,0.820816,0.014295,193.639933,0.833863
9,-0.090109,0.166385,0.280893,0.779700,-0.144999,277.647230,0.762742


In [11]:
# Persist the stacking summary; the positional row index is omitted.
result.to_csv(path_or_buf="Stacked_Model_Coefficients.csv", index=False)