# Prediction of Sharpe ratio for blends of quantitative strategies

In [104]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor 

import tensorflow as tf
from keras import backend as K

## Data exploration

In [105]:
data = pd.read_csv("Data/Training_Input_2dx8C9Q.csv")
target = pd.read_csv("Data/Training_Output_IJhBXtA.csv")

In [106]:
test = pd.read_csv("Data/Testing_Input_dPKY3Rf.csv")

In [None]:
data.shape

In [None]:
target.shape

In [None]:
test.shape

In [107]:
data = data.set_index("ID")
target = target.set_index("ID")

In [108]:
test = test.set_index("ID")

In [None]:
data.head()

In [None]:
data.info()

In [53]:
target.head()

Unnamed: 0_level_0,Target
ID,Unnamed: 1_level_1
0,-12.007941
1,2.294867
2,0.652308
3,2.412364
4,8.517471


In [None]:
target.info()

In [None]:
# Missing values?
# `True in df.isna()` only tests membership in the column labels, so it can
# never detect a NaN; reduce the boolean mask over all cells instead.
print(data.isna().values.any(), target.isna().values.any(), test.isna().values.any())

In [None]:
# Let's look at the columns to understand how the dataset is built.
# Each of the 10 time series begins with lag_20 and finishes with lag_0.
# So lag_20 represents the first day of a month, where the investment is set to a value of "100".
for i in data.columns:
    print(i)

In [None]:
# There are up to 50 different samples (different weights) for the same time series of 26 days
# (21 days of training and 5 days of prediction).
# Given a training dataset of 10 000 samples, if there are exactly 50 samples per series, we should obtain:
# 10 000/50 = 200 unique time series

In [None]:
# I think any strategy I or macro feature X could allow to find the number of unique values,
# because for 20 different variations over the 21 days, only the same variations can give a same final value
# there is no place for hazard here. 

# For macro indicator X_1 (for example) we can find our same 200 time series starting at lag_18 (3rd day of the month)
for i in range(0,21):
    print(i,data["X_1_lag_"+str(i)].nunique())

In [None]:
# Let's check for the test dataset with I_7 (for example) 
for i in range(0,21):
    print(i,test["I_7_lag_"+str(i)].nunique())

## Creating the customized cross_val 

In [None]:
# A difficulty here is that the different samples of a same time series are shuffled within the dataset.
# So I cannot just randomly split the dataset into train/val datasets.
# I have to make sure that train and validation contain different time series.
# So let's build our own cross-validation split.

In [None]:
# First, let's check whether the same time series appear in both train and test.
# We have 200 unique values for train and 89 for test; if the time series are indeed different,
# we should obtain 289 unique values in the concatenation of both.
check = pd.concat([data, test])
for i in range(0,21):
    print(i,check["I_7_lag_"+str(i)].nunique())

In [None]:
# we now know train and test are not shuffled and correspond to different time series.     

In [109]:
# Let's split train/val by values of X1_lag_0 (for example), as there are 200 unique values
values = data.X_1_lag_0.unique()
train, val = train_test_split(values, random_state = 2)

In [66]:
train.shape

(150,)

In [67]:
val.shape

(50,)

In [110]:
full_data = data.copy()

In [111]:
# I add the target to split its values the same way as for features. 
full_data["target"] = target.Target

In [112]:
# By filtering on train and val unique values, I can apply the split on the entire dataset
df_train = full_data[data.X_1_lag_0.isin(train)]
df_val = full_data[data.X_1_lag_0.isin(val)]

In [74]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7500 entries, 0 to 9999
Columns: 530 entries, weight_I_1 to target
dtypes: float64(530)
memory usage: 30.4 MB


In [76]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2500 entries, 2 to 9998
Columns: 530 entries, weight_I_1 to target
dtypes: float64(530)
memory usage: 10.1 MB


In [113]:
X_train = df_train.drop(["target"],axis=1)
y_train = df_train.loc[:,["target"]]

In [114]:
X_val = df_val.drop(["target"],axis=1)
y_val = df_val.loc[:,["target"]]

In [115]:
y_train["smoothed"] = np.sign(y_train["target"])*np.exp(-1/abs(y_train["target"]))
y_val["smoothed"] = np.sign(y_val["target"])*np.exp(-1/abs(y_val["target"]))

## Benchmark

In [None]:
# Let's recreate the benchmark score
# The benchmark is the average Sharpe ratio of the training period
# The metric is an L1 norm on values smoothed by the function: f(x) = sign(x)*exp(-1/abs(x))

#avg = y_train.target.mean()
avg = target.Target.mean()
X_train_benchmark = X_train.copy()
X_val_benchmark = X_val.copy()
X_train_benchmark["pred_benchmark"] = np.sign(avg)*np.exp(-1/abs(avg))
X_val_benchmark["pred_benchmark"] = np.sign(avg)*np.exp(-1/abs(avg))

In [None]:
benchmark_train_score = mean_absolute_error(y_train["smoothed"], X_train_benchmark["pred_benchmark"])
benchmark_val_score =  mean_absolute_error(y_val["smoothed"], X_val_benchmark["pred_benchmark"])
print("The average Sharpe ratio over the training set is :",avg)
print("Benchmark train score: {}, Benchmark validation score: {}".format(benchmark_train_score, benchmark_val_score))

## Decision Tree

In [None]:
dtr = DecisionTreeRegressor(criterion = "mae", max_depth=2)
dtr.fit(X_train, y_train.smoothed)

In [None]:
y_pred_train = dtr.predict(X_train)
y_pred_val = dtr.predict(X_val)

In [None]:
dtr_train_score = mean_absolute_error(y_train.smoothed, y_pred_train)
dtr_val_score = mean_absolute_error(y_val.smoothed, y_pred_val)
print(dtr_train_score, dtr_val_score)

In [None]:
def plot_feature_importances(model, Frame):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    One bar is drawn per column of ``Frame``; the column names are used as
    y-axis tick labels. The current figure is cleared first and a new tall
    (10 x 100) figure is opened for the chart. Nothing is returned.
    """
    labels = Frame.columns
    count = len(labels)

    plt.clf()
    plt.figure(figsize=(10,100))
    plt.barh(range(count), model.feature_importances_, align='center')
    plt.yticks(np.arange(count), labels)
    plt.ylim(-1, count)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

## Random Forest

In [57]:
target["smoothed"] = np.sign(target["Target"])*np.exp(-1/abs(target["Target"]))

In [57]:
rfr = RandomForestRegressor(max_depth=2, criterion="mae", random_state=2)

In [58]:
%%time
rfr.fit(data_rf, target.smoothed)

CPU times: user 7min 11s, sys: 2.18 s, total: 7min 13s
Wall time: 7min 31s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=2, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=2, verbose=0, warm_start=False)

In [None]:
rfr_pred_train = rfr_2.predict(X_train_norm)
rfr_pred_val = rfr_2.predict(X_val_norm)

In [75]:
rf_pred_train = rfr.predict(data_rf)
rf_pred = rfr.predict(test_rf)

In [None]:
print(mean_absolute_error(rfr_pred_train, y_train.smoothed))
print(mean_absolute_error(rfr_pred_val, y_val.smoothed))

In [None]:
X_train["pred"] = rfr_pred_train
X_val["pred"] = rfr_pred_val

In [76]:
data_rf["pred"] = rf_pred_train
test_rf["pred"] = rf_pred 

In [None]:
X_train["fpred"] = np.sign(X_train["pred"])*np.exp(-1/abs(X_train["pred"]))
X_val["fpred"] = np.sign(X_val["pred"])*np.exp(-1/abs(X_val["pred"]))

In [448]:
data_rf["fpred"] = -np.sign(data_rf['pred'])/(np.log(abs(data_rf['pred'])))
#data_rf["fpred"] = np.sign(data_rf['pred'])* np.exp(-1/abs(data_rf['pred']))
test_rf["fpred"] = -np.sign(test_rf['pred'])/(np.log(abs(test_rf['pred'])))

In [449]:
mean_absolute_error(target.smoothed, data_rf.fpred)

0.5792647390486836

In [72]:
test_rf.fpred.to_csv("submission_rf_pred_1")

In [None]:
print(mean_absolute_error(X_train.fpred, y_train.smoothed), mean_absolute_error(X_val.fpred, y_val.smoothed))
X_train = X_train.drop(["pred","fpred"], axis=1)
X_val = X_val.drop(["pred","fpred"], axis=1)

## XGBoost

In [96]:
xgb_2 = XGBRegressor()

In [97]:
xgb_2.fit(X_train_xgb, y_train.smoothed)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [98]:
xgb_pred_train = xgb_2.predict(X_train_xgb)
xgb_pred_val = xgb_2.predict(X_val_xgb)

In [99]:
X_train_xgb["pred"] = xgb_pred_train
X_val_xgb["pred"] = xgb_pred_val

In [100]:
print(mean_absolute_error(X_train_xgb.pred, y_train.smoothed))
print(mean_absolute_error(X_val_xgb.pred, y_val.smoothed))

0.35204926450015933
0.5303315033120306


In [102]:
X_train_xgb["fpred"] = np.sign(X_train_xgb["pred"])*np.exp(-1/abs(X_train_xgb["pred"]))
X_val_xgb["fpred"] = np.sign(X_val_xgb["pred"])*np.exp(-1/abs(X_val_xgb["pred"]))

In [103]:
print(mean_absolute_error(X_train_xgb.fpred, y_train.smoothed))
print(mean_absolute_error(X_val_xgb.fpred, y_val.smoothed))

0.5406433351653317
0.5806568335701102


In [58]:
xgb = XGBRegressor()
xgb.fit(data_rf, target.smoothed)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [60]:
xgb_pred = xgb.predict(data_rf)

In [63]:
data_rf["pred"] = xgb_pred
data_rf["fpred"] = np.sign(data_rf['pred'])* np.exp(-1/abs(data_rf['pred']))

In [64]:
print(mean_absolute_error(data_rf.pred, target.smoothed))
print(mean_absolute_error(data_rf.fpred, target.smoothed))

0.368835986301781
0.5425651188313596


## Linear Regressions

In [None]:
sc_x = StandardScaler()
X_train_norm = sc_x.fit_transform(X_train)
X_val_norm = sc_x.transform(X_val)

### ElasticNet

In [None]:
elastic_net = ElasticNet(alpha= 1, l1_ratio= 0)
elastic_net.fit(X_train, y_train.smoothed)

In [None]:
elast_pred_train = elastic_net.predict(X_train)
elast_pred_val = elastic_net.predict(X_val)

In [None]:
X_train["pred"] = elast_pred_train
X_val["pred"] = elast_pred_val

In [None]:
X_train["fpred"] = np.sign(X_train["pred"])*np.exp(-1/abs(X_train["pred"]))
X_val["fpred"] = np.sign(X_val["pred"])*np.exp(-1/abs(X_val["pred"]))

In [None]:
print(mean_absolute_error(X_train.fpred, y_train.smoothed), mean_absolute_error(X_val.fpred, y_val.smoothed))
X_train = X_train.drop(["pred","fpred"], axis=1)
X_val = X_val.drop(["pred","fpred"], axis=1)

In [None]:
params = {'alpha' : [0.01, 1, 5, 10], 'l1_ratio' : [0, 1]}
reg = ElasticNet()
grid = GridSearchCV(reg,param_grid=params, cv = 5)

grid.fit(X_train_norm, y_train.smoothed)

In [None]:
grid.best_params_

In [None]:
print(mean_absolute_error(grid.predict(X_train_norm), y_train.smoothed))
mean_absolute_error(grid.predict(X_val_norm), y_val.smoothed)

#### ElasticNet with feature selection pipeline and GridSearch

In [None]:
pipeline = Pipeline(
    [
        ('selector', SelectKBest(f_regression)),
        ('model',ElasticNet())
    ]
)

In [None]:
params_elasticnet = {'selector__k':[10,100,150,200,215], 'model__alpha' : [0.01, 1, 5, 10], 'model__l1_ratio' : [0, 1]}

In [None]:
search = GridSearchCV(
    estimator = pipeline,
    param_grid = params_elasticnet,
    scoring = 'neg_mean_absolute_error',
    cv = 5
)

In [None]:
search.fit(X_train_norm, y_train.smoothed)

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
print(mean_absolute_error(search.predict(X_train_norm), y_train.smoothed))
print(mean_absolute_error(search.predict(X_val_norm), y_val.smoothed))

## RNN 

### LSTM Univariate model / xgboost

In [None]:
X_train_I1 = X_train.loc[:,"I_1_lag_20":"I_1_lag_0"]
X_train_I2 = X_train.loc[:,"I_2_lag_20":"I_2_lag_0"]
X_train_I3 = X_train.loc[:,"I_3_lag_20":"I_3_lag_0"]
X_train_I4 = X_train.loc[:,"I_4_lag_20":"I_4_lag_0"]
X_train_I5 = X_train.loc[:,"I_5_lag_20":"I_5_lag_0"]
X_train_I6 = X_train.loc[:,"I_6_lag_20":"I_6_lag_0"]
X_train_I7 = X_train.loc[:,"I_7_lag_20":"I_7_lag_0"]

In [None]:
X_val_I1 = X_val.loc[:,"I_1_lag_20":"I_1_lag_0"]
X_val_I2 = X_val.loc[:,"I_2_lag_20":"I_2_lag_0"]
X_val_I3 = X_val.loc[:,"I_3_lag_20":"I_3_lag_0"]
X_val_I4 = X_val.loc[:,"I_4_lag_20":"I_4_lag_0"]
X_val_I5 = X_val.loc[:,"I_5_lag_20":"I_5_lag_0"]
X_val_I6 = X_val.loc[:,"I_6_lag_20":"I_6_lag_0"]
X_val_I7 = X_val.loc[:,"I_7_lag_20":"I_7_lag_0"]

In [None]:
X_test_I1 = test.loc[:,"I_1_lag_20":"I_1_lag_0"]
X_test_I2 = test.loc[:,"I_2_lag_20":"I_2_lag_0"]
X_test_I3 = test.loc[:,"I_3_lag_20":"I_3_lag_0"]
X_test_I4 = test.loc[:,"I_4_lag_20":"I_4_lag_0"]
X_test_I5 = test.loc[:,"I_5_lag_20":"I_5_lag_0"]
X_test_I6 = test.loc[:,"I_6_lag_20":"I_6_lag_0"]
X_test_I7 = test.loc[:,"I_7_lag_20":"I_7_lag_0"]

In [None]:
X_train_I1 = sc_x.fit_transform(X_train_I1)
X_train_I2 = sc_x.fit_transform(X_train_I2)
X_train_I3 = sc_x.fit_transform(X_train_I3)
X_train_I4 = sc_x.fit_transform(X_train_I4)
X_train_I5 = sc_x.fit_transform(X_train_I5)
X_train_I6 = sc_x.fit_transform(X_train_I6)
X_train_I7 = sc_x.fit_transform(X_train_I7)

In [None]:
X_val_I1 = sc_x.transform(X_val_I1)
X_val_I2 = sc_x.transform(X_val_I2)
X_val_I3 = sc_x.transform(X_val_I3)
X_val_I4 = sc_x.transform(X_val_I4)
X_val_I5 = sc_x.transform(X_val_I5)
X_val_I6 = sc_x.transform(X_val_I6)
X_val_I7 = sc_x.transform(X_val_I7)

In [None]:
X_test_I1 = sc_x.transform(X_test_I1)
X_test_I2 = sc_x.transform(X_test_I2)
X_test_I3 = sc_x.transform(X_test_I3)
X_test_I4 = sc_x.transform(X_test_I4)
X_test_I5 = sc_x.transform(X_test_I5)
X_test_I6 = sc_x.transform(X_test_I6)
X_test_I7 = sc_x.transform(X_test_I7)

In [None]:
X_train_I1 = np.expand_dims(X_train_I1, axis=-1)
X_train_I2 = np.expand_dims(X_train_I2, axis=-1)
X_train_I3 = np.expand_dims(X_train_I3, axis=-1)
X_train_I4 = np.expand_dims(X_train_I4, axis=-1)
X_train_I5 = np.expand_dims(X_train_I5, axis=-1)
X_train_I6 = np.expand_dims(X_train_I6, axis=-1)
X_train_I7 = np.expand_dims(X_train_I7, axis=-1)

In [None]:
X_val_I1 = np.expand_dims(X_val_I1, axis=-1)
X_val_I2 = np.expand_dims(X_val_I2, axis=-1)
X_val_I3 = np.expand_dims(X_val_I3, axis=-1)
X_val_I4 = np.expand_dims(X_val_I4, axis=-1)
X_val_I5 = np.expand_dims(X_val_I5, axis=-1)
X_val_I6 = np.expand_dims(X_val_I6, axis=-1)
X_val_I7 = np.expand_dims(X_val_I7, axis=-1)

In [None]:
X_test_I1 = np.expand_dims(X_test_I1, axis=-1)
X_test_I2 = np.expand_dims(X_test_I2, axis=-1)
X_test_I3 = np.expand_dims(X_test_I3, axis=-1)
X_test_I4 = np.expand_dims(X_test_I4, axis=-1)
X_test_I5 = np.expand_dims(X_test_I5, axis=-1)
X_test_I6 = np.expand_dims(X_test_I6, axis=-1)
X_test_I7 = np.expand_dims(X_test_I7, axis=-1)

In [None]:
BATCH_SIZE = 158
BUFFER_SIZE = 10000

In [None]:
train_univariate_1 = tf.data.Dataset.from_tensor_slices((X_train_I1, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
train_univariate_2 = tf.data.Dataset.from_tensor_slices((X_train_I2, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
train_univariate_3 = tf.data.Dataset.from_tensor_slices((X_train_I3, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
train_univariate_4 = tf.data.Dataset.from_tensor_slices((X_train_I4, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
train_univariate_5 = tf.data.Dataset.from_tensor_slices((X_train_I5, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
train_univariate_6 = tf.data.Dataset.from_tensor_slices((X_train_I6, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
train_univariate_7 = tf.data.Dataset.from_tensor_slices((X_train_I7, y_train.smoothed)).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()


In [None]:
val_univariate_1 = tf.data.Dataset.from_tensor_slices((X_val_I1, y_val.smoothed)).batch(BATCH_SIZE).repeat()
val_univariate_2 = tf.data.Dataset.from_tensor_slices((X_val_I2, y_val.smoothed)).batch(BATCH_SIZE).repeat()
val_univariate_3 = tf.data.Dataset.from_tensor_slices((X_val_I3, y_val.smoothed)).batch(BATCH_SIZE).repeat()
val_univariate_4 = tf.data.Dataset.from_tensor_slices((X_val_I4, y_val.smoothed)).batch(BATCH_SIZE).repeat()
val_univariate_5 = tf.data.Dataset.from_tensor_slices((X_val_I5, y_val.smoothed)).batch(BATCH_SIZE).repeat()
val_univariate_6 = tf.data.Dataset.from_tensor_slices((X_val_I6, y_val.smoothed)).batch(BATCH_SIZE).repeat()
val_univariate_7 = tf.data.Dataset.from_tensor_slices((X_val_I7, y_val.smoothed)).batch(BATCH_SIZE).repeat()

In [None]:
print(val_univariate_1.take(1))
y_train.smoothed.shape

In [None]:
def R2(y_true, y_pred):
    """Keras metric: coefficient of determination of ``y_pred`` against ``y_true``.

    Returns ``1 - SS_res / (SS_tot + epsilon)``, where ``SS_res`` is the sum of
    squared residuals and ``SS_tot`` is the sum of squared deviations of
    ``y_true`` from its mean. The backend epsilon keeps the division defined
    when ``y_true`` is constant.
    """
    from keras import backend as K
    residual_ss = K.sum(K.square(y_true - y_pred))
    centered = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centered))
    ratio = residual_ss/(total_ss + K.epsilon())
    return (1 - ratio)

In [None]:
simple_lstm_model_1 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])
simple_lstm_model_2 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])
simple_lstm_model_3 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])
simple_lstm_model_4 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])
simple_lstm_model_5 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])
simple_lstm_model_6 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])
simple_lstm_model_7 = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, input_shape=X_train_I1.shape[-2:]),
    tf.keras.layers.Dense(1)])

In [None]:
simple_lstm_model_1.compile(optimizer='adam', loss='mae', metrics=[R2])
simple_lstm_model_2.compile(optimizer='adam', loss='mae', metrics=[R2])
simple_lstm_model_3.compile(optimizer='adam', loss='mae', metrics=[R2])
simple_lstm_model_4.compile(optimizer='adam', loss='mae', metrics=[R2])
simple_lstm_model_5.compile(optimizer='adam', loss='mae', metrics=[R2])
simple_lstm_model_6.compile(optimizer='adam', loss='mae', metrics=[R2])
simple_lstm_model_7.compile(optimizer='adam', loss='mae', metrics=[R2])

In [None]:
# Sanity check of the model output shape on one validation batch.
# `val_univariate` / `simple_lstm_model` are never defined in this notebook;
# use the first per-strategy dataset and model instead.
for x, _ in val_univariate_1.take(1):
    print(simple_lstm_model_1.predict(x).shape)

In [None]:
EVALUATION_INTERVAL = 10
EPOCHS = 10

In [None]:
history_1 = simple_lstm_model_1.fit(train_univariate_1, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_1, validation_steps=50)
print("-----------------------------------------------------------------------")
history_2 = simple_lstm_model_2.fit(train_univariate_2, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_2, validation_steps=50)
print("-----------------------------------------------------------------------")
history_3 = simple_lstm_model_3.fit(train_univariate_3, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_3, validation_steps=50)
print("-----------------------------------------------------------------------")
history_4 = simple_lstm_model_4.fit(train_univariate_4, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_4, validation_steps=50)
print("-----------------------------------------------------------------------")
history_5 = simple_lstm_model_5.fit(train_univariate_5, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_5, validation_steps=50)
print("-----------------------------------------------------------------------")
history_6 = simple_lstm_model_6.fit(train_univariate_6, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_6, validation_steps=50)
print("-----------------------------------------------------------------------")
history_7 = simple_lstm_model_7.fit(train_univariate_7, epochs=EPOCHS,
                      steps_per_epoch=EVALUATION_INTERVAL,
                      validation_data=val_univariate_7, validation_steps=50)

In [None]:
plt.plot(history_2.history["loss"], color="b", label="Training Loss")
plt.plot(history_2.history["val_loss"], color="r", label="Validation Loss")
plt.ylabel("loss")
plt.xlabel("Epochs")
plt.legend()
plt.show()

In [None]:
# `history` is never defined; the fitted histories are history_1..history_7.
# Plot the same model as the loss curves in the previous cell (history_2).
plt.plot(history_2.history["R2"], color="b", label = "Training R2")
plt.plot(history_2.history["val_R2"], color="r", label ="Validation R2")
plt.ylabel("R2")
plt.xlabel("Epochs")
plt.legend()
plt.show()

In [None]:
# prediction on train values
lstm_pred_I1 = simple_lstm_model_1.predict(X_train_I1)
lstm_pred_I2 = simple_lstm_model_2.predict(X_train_I2)
lstm_pred_I3 = simple_lstm_model_3.predict(X_train_I3)
lstm_pred_I4 = simple_lstm_model_4.predict(X_train_I4)
lstm_pred_I5 = simple_lstm_model_5.predict(X_train_I5)
lstm_pred_I6 = simple_lstm_model_6.predict(X_train_I6)
lstm_pred_I7 = simple_lstm_model_7.predict(X_train_I7)

In [None]:
# prediction on validation values
lstm_val_pred_I1 = simple_lstm_model_1.predict(X_val_I1)
lstm_val_pred_I2 = simple_lstm_model_2.predict(X_val_I2)
lstm_val_pred_I3 = simple_lstm_model_3.predict(X_val_I3)
lstm_val_pred_I4 = simple_lstm_model_4.predict(X_val_I4)
lstm_val_pred_I5 = simple_lstm_model_5.predict(X_val_I5)
lstm_val_pred_I6 = simple_lstm_model_6.predict(X_val_I6)
lstm_val_pred_I7 = simple_lstm_model_7.predict(X_val_I7)

In [None]:
# prediction on test values
lstm_test_pred_I1 = simple_lstm_model_1.predict(X_test_I1)
lstm_test_pred_I2 = simple_lstm_model_2.predict(X_test_I2)
lstm_test_pred_I3 = simple_lstm_model_3.predict(X_test_I3)
lstm_test_pred_I4 = simple_lstm_model_4.predict(X_test_I4)
lstm_test_pred_I5 = simple_lstm_model_5.predict(X_test_I5)
lstm_test_pred_I6 = simple_lstm_model_6.predict(X_test_I6)
lstm_test_pred_I7 = simple_lstm_model_7.predict(X_test_I7)

In [None]:
d_train = {"lstm_pred_I1":lstm_pred_I1[:,0], "lstm_pred_I2":lstm_pred_I2[:,0], "lstm_pred_I3":lstm_pred_I3[:,0],
    "lstm_pred_I4":lstm_pred_I4[:,0], "lstm_pred_I5":lstm_pred_I5[:,0], "lstm_pred_I6":lstm_pred_I6[:,0], 
     "lstm_pred_I7":lstm_pred_I7[:,0]}
lstm_I_train = pd.DataFrame(data=d_train, index=X_train.index)

In [None]:
lstm_wI_train = pd.concat([X_train.iloc[:,0:7], lstm_I_train], axis=1)

In [None]:
d_val = {"lstm_pred_I1":lstm_val_pred_I1[:,0], "lstm_pred_I2":lstm_val_pred_I2[:,0], "lstm_pred_I3":lstm_val_pred_I3[:,0],
    "lstm_pred_I4":lstm_val_pred_I4[:,0], "lstm_pred_I5":lstm_val_pred_I5[:,0], "lstm_pred_I6":lstm_val_pred_I6[:,0], 
     "lstm_pred_I7":lstm_val_pred_I7[:,0]}
lstm_I_val = pd.DataFrame(data=d_val, index=X_val.index)

In [None]:
lstm_wI_val = pd.concat([X_val.iloc[:,0:7], lstm_I_val], axis=1)

In [None]:
lstm_wI_val

In [None]:
d_test = {"lstm_pred_I1":lstm_test_pred_I1[:,0], "lstm_pred_I2":lstm_test_pred_I2[:,0], "lstm_pred_I3":lstm_test_pred_I3[:,0],
    "lstm_pred_I4":lstm_test_pred_I4[:,0], "lstm_pred_I5":lstm_test_pred_I5[:,0], "lstm_pred_I6":lstm_test_pred_I6[:,0], 
     "lstm_pred_I7":lstm_test_pred_I7[:,0]}
lstm_I_test = pd.DataFrame(data=d_test, index=test.index)

In [None]:
lstm_wI_test = pd.concat([test.iloc[:,0:7], lstm_I_test], axis=1)

In [None]:
# Let's train an adaboost on the lstm predictions

In [None]:
adaboost = AdaBoostRegressor()
adaboost.fit(lstm_wI_train, y_train.smoothed)

In [None]:
ada_train_pred = adaboost.predict(lstm_wI_train)
ada_val_pred = adaboost.predict(lstm_wI_val)
ada_test_pred = adaboost.predict(lstm_wI_test)

In [None]:
lstm_wI_train["pred"] = ada_train_pred
lstm_wI_val["pred"] = ada_val_pred
lstm_wI_test["pred"] = ada_test_pred

In [None]:
lstm_wI_train["fpred"] = np.sign(lstm_wI_train["pred"])*np.exp(-1/abs(lstm_wI_train["pred"]))
lstm_wI_val["fpred"] = np.sign(lstm_wI_val["pred"])*np.exp(-1/abs(lstm_wI_val["pred"]))
lstm_wI_test["fpred"] = np.sign(lstm_wI_test["pred"])*np.exp(-1/abs(lstm_wI_test["pred"]))

In [None]:
print(mean_absolute_error(y_train.smoothed,lstm_wI_train["fpred"]))
print(mean_absolute_error(y_val.smoothed,lstm_wI_val["fpred"]))

Submission of the lstm/adaboost model on raw data

In [None]:
lstm_wI_test.fpred.to_csv("submission_lstm_adaboost")

## Feature Engineering 

### Functions

In [116]:
X_train = df_train.drop(["target"],axis=1)
y_train = df_train.loc[:,["target"]]

In [117]:
X_val = df_val.drop(["target"],axis=1)
y_val = df_val.loc[:,["target"]]

In [118]:
# Daily Rate of Change for each strategy 
def I_roc(dataset):
    """Append the daily log rate of change of each strategy to ``dataset``.

    For every strategy ``i`` in 1..7 and every lag ``j`` in 0..19, a column
    ``I_{i}_roc_{j}`` holding ``log(I_{i}_lag_{j} / I_{i}_lag_{j+1})`` is
    written into ``dataset`` (modified in place).

    Returns the list of written column names, in creation order.
    """
    created = []
    for strat in range(1, 8):
        for lag in range(20):
            name = "I_{}_roc_{}".format(strat, lag)
            current = dataset["I_{}_lag_{}".format(strat, lag)]
            previous = dataset["I_{}_lag_{}".format(strat, lag + 1)]
            dataset[name] = np.log(current / previous)
            created.append(name)
    return created

In [119]:
I_roc_list = I_roc(X_train)

In [120]:
# Monthly return for each strategy
def I_roc20(dataset):
    """Append the 20-day cumulated log return of each strategy to ``dataset``.

    For every strategy ``i`` in 1..7, column ``I_{i}_roc20`` receives the sum
    over lags ``j`` = 0..19 of ``log(I_{i}_lag_{j} / I_{i}_lag_{j+1})``.
    ``dataset`` is modified in place.

    Returns the list of written column names.
    """
    created = []
    for strat in range(1, 8):
        name = "I_{}_roc20".format(strat)
        created.append(name)
        # sum() starts from 0 and adds the daily terms in lag order.
        dataset[name] = sum(
            np.log(dataset["I_{}_lag_{}".format(strat, lag)]
                   / dataset["I_{}_lag_{}".format(strat, lag + 1)])
            for lag in range(20)
        )
    return created

In [121]:
I_roc20_list = I_roc20(X_train)

In [122]:
# last week return for each strategy
def I_roc5(dataset):
    """Append the most recent 5-day cumulated log return of each strategy.

    For every strategy ``i`` in 1..7, column ``I_{i}_roc5`` receives the sum
    over lags ``j`` = 0..4 of ``log(I_{i}_lag_{j} / I_{i}_lag_{j+1})``.
    ``dataset`` is modified in place.

    Returns the list of written column names.
    """
    created = []
    for strat in range(1, 8):
        name = "I_{}_roc5".format(strat)
        created.append(name)
        # sum() starts from 0 and adds the daily terms in lag order.
        dataset[name] = sum(
            np.log(dataset["I_{}_lag_{}".format(strat, lag)]
                   / dataset["I_{}_lag_{}".format(strat, lag + 1)])
            for lag in range(5)
        )
    return created

In [123]:
I_roc5_list = I_roc5(X_train)

In [124]:
# Daily Rate of Change for each macro-economic feature 
def X_roc(dataset):
    """Append the daily log rate of change of each macro feature to ``dataset``.

    For every macro feature ``i`` in 1..3 and every lag ``j`` in 0..19, a
    column ``X_{i}_roc_{j}`` holding ``log(X_{i}_lag_{j} / X_{i}_lag_{j+1})``
    is written into ``dataset`` (modified in place).

    Returns the list of written column names, in creation order.
    """
    created = []
    for feat in range(1, 4):
        for lag in range(20):
            name = "X_{}_roc_{}".format(feat, lag)
            current = dataset["X_{}_lag_{}".format(feat, lag)]
            previous = dataset["X_{}_lag_{}".format(feat, lag + 1)]
            dataset[name] = np.log(current / previous)
            created.append(name)
    return created

In [125]:
X_roc_list = X_roc(X_train)

In [126]:
# Monthly return for each macro-economic feature
def X_roc20(dataset):
    """Append the 20-day cumulated log return of each macro feature.

    For every macro feature ``i`` in 1..3, column ``X_{i}_roc20`` receives the
    sum over lags ``j`` = 0..19 of ``log(X_{i}_lag_{j} / X_{i}_lag_{j+1})``.
    ``dataset`` is modified in place.

    Returns the list of written column names.
    """
    created = []
    for feat in range(1, 4):
        name = "X_{}_roc20".format(feat)
        created.append(name)
        # sum() starts from 0 and adds the daily terms in lag order.
        dataset[name] = sum(
            np.log(dataset["X_{}_lag_{}".format(feat, lag)]
                   / dataset["X_{}_lag_{}".format(feat, lag + 1)])
            for lag in range(20)
        )
    return created

In [127]:
X_roc20_list = X_roc20(X_train)

In [128]:
# last week return for each macro-economic feature
def X_roc5(dataset):
    """Append the most recent 5-day cumulated log return of each macro feature.

    For every macro feature ``i`` in 1..3, column ``X_{i}_roc5`` receives the
    sum over lags ``j`` = 0..4 of ``log(X_{i}_lag_{j} / X_{i}_lag_{j+1})``.
    ``dataset`` is modified in place.

    Returns the list of written column names.
    """
    created = []
    for feat in range(1, 4):
        name = "X_{}_roc5".format(feat)
        created.append(name)
        # sum() starts from 0 and adds the daily terms in lag order.
        dataset[name] = sum(
            np.log(dataset["X_{}_lag_{}".format(feat, lag)]
                   / dataset["X_{}_lag_{}".format(feat, lag + 1)])
            for lag in range(5)
        )
    return created

In [129]:
X_roc5_list = X_roc5(X_train)

In [130]:
# Weekly rate of change shifted with a window step = 5 
def I_roc5_shifted(dataset):
    """Append 5-day cumulated log returns on three earlier windows.

    For every shift ``s`` in 1..3 and every strategy ``i`` in 1..7, column
    ``I_{i}_roc5_S{s}`` receives the sum, for ``j`` = 0..4, of
    ``log(I_{i}_lag_{j+5s} / I_{i}_lag_{j+5s+1})``, i.e. the 5-day window
    starting at lag ``5*s``. ``dataset`` is modified in place.

    Returns the list of written column names (shift-major order).
    """
    created = []
    for shift in range(1, 4):
        offset = 5 * shift
        for strat in range(1, 8):
            name = "I_{}_roc5_S{}".format(strat, shift)
            created.append(name)
            # sum() starts from 0 and adds the daily terms in lag order.
            dataset[name] = sum(
                np.log(dataset["I_{}_lag_{}".format(strat, day + offset)]
                       / dataset["I_{}_lag_{}".format(strat, day + offset + 1)])
                for day in range(5)
            )
    return created

In [131]:
I_roc5_shifted_list = I_roc5_shifted(X_train)

In [132]:
# last week weighted return 
def I_wr(dataset):
    """Append the weighted last-week return of the blend as column ``I_wr``.

    For every strategy ``i`` in 1..7 and lag ``j`` in 0..4, the daily log
    return ``log(I_{i}_lag_{j} / I_{i}_lag_{j+1})`` is multiplied by
    ``weight_I_{i}`` and scaled by 252/5; all terms are summed into ``I_wr``.
    ``dataset`` is modified in place.

    Returns ``["I_wr"]``.
    """
    total = 0
    for strat in range(1, 8):
        weight = dataset["weight_I_{}".format(strat)]
        for day in range(5):
            daily = np.log(dataset["I_{}_lag_{}".format(strat, day)]
                           / dataset["I_{}_lag_{}".format(strat, day + 1)])
            total = total + (daily * weight) * 252/5
    dataset["I_wr"] = total
    return ["I_wr"]

In [133]:
I_wr_list = I_wr(X_train)

In [134]:
# Weighted rate of return for each shifted window
# This function need "I_roc5_shifted" appended to the dataset first
def I_wr_shifted(dataset):
    """Append the weighted return of the blend for each shifted 5-day window.

    For every shift ``s`` in 1..3, column ``I_wr_S{s}`` receives the sum over
    strategies ``i`` in 1..7 of ``I_{i}_roc5_S{s} * weight_I_{i} * (252/5)``.
    The ``I_{i}_roc5_S{s}`` columns must already exist in ``dataset``, which
    is modified in place.

    Returns the list of written column names.
    """
    created = []
    for shift in range(1, 4):
        name = "I_wr_S{}".format(shift)
        created.append(name)
        # sum() starts from 0 and adds the per-strategy terms in order.
        dataset[name] = sum(
            dataset["I_{}_roc5_S{}".format(strat, shift)]
            * dataset["weight_I_{}".format(strat)] * (252/5)
            for strat in range(1, 8)
        )
    return created

In [135]:
I_wr_shifted_list = I_wr_shifted(X_train)

In [136]:
# Cov of strategies I
# This function need "I_roc" and "I_roc20" appended to the dataset first  
def I_cov(dataset):
    """Append pairwise co-movement terms between strategies to ``dataset``.

    For every ordered pair ``(i, j)`` of strategies in 1..7, column
    ``I_cov_{i}{j}`` receives the sum over ``t`` = 0..19 of
    ``(I_{i}_roc_{t} - I_{i}_roc20) * (I_{j}_roc_{t} - I_{j}_roc20)``.
    The ``I_{i}_roc_{t}`` and ``I_{i}_roc20`` columns must already exist in
    ``dataset``, which is modified in place.

    NOTE(review): the centering term is the ``roc20`` column as-is and the sum
    is not divided by the number of days -- confirm this matches the intended
    covariance definition.

    Returns the list of written column names (49 names, row-major order).
    """
    created = []
    for a in range(1, 8):
        center_a = dataset["I_{}_roc20".format(a)]
        for b in range(1, 8):
            center_b = dataset["I_{}_roc20".format(b)]
            name = "I_cov_{}{}".format(a, b)
            created.append(name)
            # sum() starts from 0 and adds the daily products in order.
            dataset[name] = sum(
                (dataset["I_{}_roc_{}".format(a, t)] - center_a)
                * (dataset["I_{}_roc_{}".format(b, t)] - center_b)
                for t in range(20)
            )
    return created

In [137]:
I_cov_list = I_cov(X_train)

In [138]:
# Cov of X 
# This function need "X_roc" and "X_roc20" appended to the dataset first  
def X_cov(dataset):
    """Append pairwise co-movement terms between macro features to ``dataset``.

    For every ordered pair ``(i, j)`` of macro features in 1..3, column
    ``X_cov_{i}{j}`` receives the sum over ``t`` = 0..19 of
    ``(X_{i}_roc_{t} - X_{i}_roc20) * (X_{j}_roc_{t} - X_{j}_roc20)``.
    The ``X_{i}_roc_{t}`` and ``X_{i}_roc20`` columns must already exist in
    ``dataset``, which is modified in place.

    NOTE(review): the centering term is the ``roc20`` column as-is and the sum
    is not divided by the number of days -- confirm this matches the intended
    covariance definition.

    Returns the list of written column names (9 names, row-major order).
    """
    created = []
    for a in range(1, 4):
        center_a = dataset["X_{}_roc20".format(a)]
        for b in range(1, 4):
            center_b = dataset["X_{}_roc20".format(b)]
            name = "X_cov_{}{}".format(a, b)
            created.append(name)
            # sum() starts from 0 and adds the daily products in order.
            dataset[name] = sum(
                (dataset["X_{}_roc_{}".format(a, t)] - center_a)
                * (dataset["X_{}_roc_{}".format(b, t)] - center_b)
                for t in range(20)
            )
    return created

In [139]:
X_cov_list = X_cov(X_train)

In [140]:
# Volatility of blended strategies
# This function need "I_cov" appended to the dataset first
def sigma(dataset):
    """Append the volatility of the blended strategies as column ``sigma``.

    Computes ``sqrt(sum_{i,j} 252 * weight_I_{i} * weight_I_{j} * I_cov_{i}{j})``
    for ``i, j`` in 1..7 and floors the result at 0.005. The ``weight_I_{i}``
    and ``I_cov_{i}{j}`` columns must already exist in ``dataset``, which is
    modified in place.

    Returns ``["sigma"]``.
    """
    variance = 0
    for i in range(1, 8):
        for j in range(1, 8):
            variance = variance + (252 * dataset['weight_I_{}'.format(i)]
                                   * dataset['weight_I_{}'.format(j)]
                                   * dataset["I_cov_{}{}".format(i, j)])
    # Floor with clip() instead of chained indexing
    # (dataset["sigma"][mask] = ...), which writes through a temporary and is
    # silently ignored under pandas Copy-on-Write. NaNs (from a negative
    # variance) are left untouched, as before.
    dataset["sigma"] = np.sqrt(variance).clip(lower=0.005)
    return ["sigma"]

In [141]:
sigma_list = sigma(X_train)

In [142]:
# Sharpe ratio of the portfolio (blended strategies)
# This function need "sigma" and "I_wr" appended to the dataset first
def SR(dataset):
    """Append the Sharpe ratio of the blend as column ``SR``.

    ``SR`` is the element-wise ratio ``I_wr / sigma``; both columns must
    already exist in ``dataset``, which is modified in place.

    Returns ``["SR"]``.
    """
    weighted_return = dataset["I_wr"]
    volatility = dataset["sigma"]
    dataset["SR"] = weighted_return / volatility
    return ["SR"]

In [143]:
SR_list = SR(X_train)

In [144]:
# Shifted Sharpe ratios of the portfolio 
# This function need "sigma" and "I_wr_shifted" appended to the dataset first 
def SR_shifted(dataset):
    """Append the Sharpe ratio of the blend for each shifted window.

    For every shift ``s`` in 1..3, column ``SR_S{s}`` is the element-wise
    ratio ``I_wr_S{s} / sigma``. Those columns must already exist in
    ``dataset``, which is modified in place.

    Returns the list of written column names.
    """
    created = ["SR_S{}".format(shift) for shift in range(1, 4)]
    for shift, name in enumerate(created, start=1):
        dataset[name] = dataset["I_wr_S{}".format(shift)]/dataset["sigma"]
    return created

In [145]:
SR_shifted_list = SR_shifted(X_train)

In [146]:
# Sharpe ratio of each strategy alone
# This function needs "I_cov" and "I_roc5" appended to the dataset first 
def SR_I(dataset):
    """Append the stand-alone Sharpe ratio of each strategy to ``dataset``.

    For every strategy ``i`` in 1..7, column ``SR_I{i}`` is
    ``I_{i}_roc5 / max(sqrt(I_cov_{i}{i}), 0.005)``. The ``I_cov_{i}{i}`` and
    ``I_{i}_roc5`` columns must already exist in ``dataset``, which is
    modified in place.

    Returns the list of written column names.
    """
    SR_I_list = []
    for i in range(1, 8):
        name = "SR_I{}".format(i)
        # Floor the denominator with clip() instead of chained indexing
        # (dataset[name][mask] = ...), which writes through a temporary and
        # is silently ignored under pandas Copy-on-Write. NaNs are preserved.
        volatility = np.sqrt(dataset["I_cov_{}{}".format(i, i)]).clip(lower=0.005)
        dataset[name] = dataset["I_{}_roc5".format(i)] / volatility
        SR_I_list.append(name)
    return SR_I_list

In [147]:
SR_I_list = SR_I(X_train)

### Selection

In [148]:
X_train = df_train.drop(["target"],axis=1)
y_train = df_train.loc[:,["target"]]

In [149]:
X_val = df_val.drop(["target"],axis=1)
y_val = df_val.loc[:,["target"]]

In [52]:
# list of indicators operations to append to the dataset
indicators = [I_roc, I_roc20, I_roc5, I_roc5_shifted, I_cov, SR_I, X_roc, X_roc20, sigma, I_wr_shifted, SR_shifted, X_cov, I_wr, SR]


In [53]:
def engineering(dataset, indicators):
    ''' Run every indicator function on the dataset, in list order.

    Each element of ``indicators`` is called with ``dataset`` as its only
    argument; the return values are discarded, so the indicators are expected
    to write their columns into ``dataset`` themselves. Indicators that read
    columns produced by others must come after them in the list.
    '''
    for add_indicator in indicators:
        add_indicator(dataset)

In [92]:
engineering(X_train, indicators)
engineering(X_val, indicators)

In [93]:
engineering(data, indicators)
engineering(test, indicators)

In [55]:
# list of indicators list to keep
selection = SR_I_list+I_roc20_list+X_roc20_list+SR_shifted_list+X_cov_list+SR_list

In [95]:
X_train_xgb = X_train[selection]
X_val_xgb = X_val[selection]

In [56]:
data_rf = data[selection]
test_rf = test[selection]