In [3]:
#################################           Load libs                      #############################################
import os
import numpy as np
import pandas as pd
from keras import callbacks
import matplotlib.pyplot as plt
%matplotlib inline
import yfinance as yf
from ts_rnn.model import TS_RNN
from keras_tuner import HyperParameters
from ts_rnn.utils import metrics_eval, train_val_test_pred_plot
from sklearn.model_selection import TimeSeriesSplit
from copy import deepcopy
import json
import shutil

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
tf.get_logger().setLevel('CRITICAL')

from sklearn.preprocessing import MinMaxScaler
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.transformations.series.difference import Differencer
from sktime.transformations.series.outlier_detection import HampelFilter
from sktime.transformations.series.boxcox import BoxCoxTransformer, LogTransformer

2022-11-16 01:30:08.042677: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Process data

In [16]:
# Experiment configuration: tuner search space, model arguments, preprocessing and CV split.
hp = HyperParameters()

# Network description as [layer type, layer kwargs] pairs.
# hp.Int / hp.Choice / hp.Float declare the values the tuner is allowed to vary.
lstm_layer = ["LSTM", {
    "units": hp.Int(name='units', min_value=2, max_value=30, step=10, default=12),
    "return_sequences": False,
    "kernel_initializer": "glorot_uniform",
    "activation": hp.Choice(name='LSTM_1_activation',
                            values=['relu', 'tanh', 'sigmoid', "linear"],
                            default='relu'),
}]
dropout_layer = ["Dropout", {
    "rate": hp.Float(name='dropout', min_value=0.0, max_value=0.5, default=0.2, step=0.05),
}]
output_layer = ["Dense", {"activation": "linear"}]
rnn_arch = {"layers": [lstm_layer, dropout_layer, output_layer]}

# Stop a fit once val_loss has not improved for 10 epochs.
my_callbacks = [callbacks.EarlyStopping(patience=10, monitor='val_loss')]

# Which series to download and for which period.
target_section = {
    'TICKERS': ['YNDX.ME', 'SBER.ME', 'POLY.ME', 'SIBN.ME', 'AMZN', 'AAPL', 'GOOGL', 'NFLX'],
    'MIN_DATE': '2012-01-01',
    'MAX_DATE': '2022-01-01',
}

# 'INIT' is unpacked into the TS_RNN constructor, 'FIT' into TS_RNN.fit.
model_section = {
    'INIT': {
        'rnn_arch': rnn_arch,
        'tuner_hp': hp,
        "n_lags": 30,
        "horizon": 7,
        "tuner": "BayesianOptimization",  # "RandomSearch", "BayesianOptimization", "Hyperband"
        "max_trials": 5,
        "loss": 'mae',
        "optimizer": 'adam',
    },
    'FIT': {
        "epochs": 40,
        "batch_size": 14,
        'callbacks': my_callbacks,
    },
}

# Constructor kwargs for every transformer class that can be named in the *_TRANSFORMERS keys.
transformer_kwargs = {
    'HampelFilter': {'window_length': 10},
    'MinMaxScaler': {"feature_range": (0, 1)},
    'Differencer': {'lags': [1]},
    'LogTransformer': {},
    'BoxCoxTransformer': {},
    'Deseasonalizer': {'sp': 364, 'model': 'multiplicative'},
}

CONFIG = {
    "TARGET": target_section,
    "VAL_LEN": 7,
    "TEST_LEN": 7,
    "CV_FOLDS": 5,
    "STRATEGIES": ["Recursive", "MiMo", "Direct", 'DirRec', "DirMo"],
    "MODEL": model_section,
    # Each *_TRANSFORMERS entry is a class name (or None to skip that preprocessing step).
    "OUTLAYER_TRANSFORMERS": 'HampelFilter',
    "BASE_TRANSFORMERS": 'Differencer',  # 'Differencer', 'LogTransformer', 'BoxCoxTransformer',
    "SEASON_TRANSFORMERS": None,
    "NORM_TRANSPORMERS": "MinMaxScaler",
    "TRANSFORMERS_ARGS": transformer_kwargs,
}

# Every CV fold holds out VAL_LEN + TEST_LEN points; the training cells split that tail in two.
tscv = TimeSeriesSplit(gap=0,
                       n_splits=CONFIG["CV_FOLDS"],
                       test_size=CONFIG["VAL_LEN"] + CONFIG["TEST_LEN"])

# NOTE(review): the (number of strategies + VAL_LEN) factor looks like a rough estimate of the
# fit count - confirm against how many models each strategy actually trains.
n_fits = (CONFIG['MODEL']['INIT']['max_trials']
          * CONFIG['CV_FOLDS']
          * len(CONFIG['TARGET']['TICKERS'])
          * (len(CONFIG["STRATEGIES"]) + CONFIG["VAL_LEN"]))
f"There will be {n_fits} fits"

'There will be 2400 fits'

## RNN training

In [12]:
def seve_fig_from_array(array, path, fig_name):
    """Plot ``array`` as a line chart and save it as ``fig_name`` inside ``path``.

    The chart is drawn on a dedicated figure rather than on pyplot's implicit
    "current figure", so a figure left open by other plotting code is neither
    drawn on nor closed by this helper.

    Parameters
    ----------
    array : anything accepted by ``Axes.plot`` (the notebook passes pandas Series).
    path : directory the image is written to; it must already exist.
    fig_name : file name of the image; no extension is added here.
    """
    fig, ax = plt.subplots()
    try:
        ax.plot(array)
        fig.savefig(os.path.join(path, fig_name))
    finally:
        # Release the figure even when plotting or saving fails, so figures
        # do not pile up across the many cross-validation iterations.
        plt.close(fig)

In [14]:
# Download the 'Adj Close' prices of the configured tickers for the configured period.
target_cfg = CONFIG['TARGET']
downloaded = yf.download(target_cfg['TICKERS'],
                         start=target_cfg['MIN_DATE'],
                         end=target_cfg['MAX_DATE'])
targets_df = downloaded['Adj Close']

# When the selection is a Series rather than a frame, name it after the first
# configured ticker and promote it to a one-column frame.
if isinstance(targets_df, pd.Series):
    targets_df = targets_df.rename(target_cfg['TICKERS'][0]).to_frame()

# Put the prices on a gap-free daily calendar between the first and the last
# observed date; the rows this introduces are filled by interpolate().
full_ind = pd.date_range(targets_df.index.min(), targets_df.index.max())
targets_df = targets_df.reindex(full_ind).interpolate()

# Make new experiment folder
reports_dir = "./reports"
os.makedirs(reports_dir, exist_ok=True)

# Numbering starts at "entries in reports + 1" as before, but moves on to the next
# number while the candidate already exists: after an older experiment folder has
# been deleted the plain count points at a taken name and os.mkdir would fail.
folder_index = len(os.listdir(reports_dir)) + 1
exp_folder = os.path.join(reports_dir, f"strategy_experiment_{folder_index}")
while os.path.exists(exp_folder):
    folder_index += 1
    exp_folder = os.path.join(reports_dir, f"strategy_experiment_{folder_index}")
new_folder_num = str(folder_index)

os.mkdir(exp_folder)
print(f"Save experiment in {exp_folder}")

# Save config
# The tuner search space ('tuner_hp') and the Keras callbacks are live objects that
# cannot be written to JSON. They are filtered out *before* deepcopy so the copy never
# has to traverse them (callbacks presumably keep a reference to the model they were
# attached to once a fit has run - TODO confirm), instead of copying and then deleting.
model_cfg = CONFIG['MODEL']
CONFIG_ = deepcopy({
    **CONFIG,
    'MODEL': {
        **model_cfg,
        'INIT': {key: value for key, value in model_cfg['INIT'].items() if key != 'tuner_hp'},
        'FIT': {key: value for key, value in model_cfg['FIT'].items() if key != 'callbacks'},
    },
})
with open(os.path.join(exp_folder, "exp_config.json"), "w") as outfile:
    json.dump(CONFIG_, outfile, skipkeys=True)

# RNN experiment: for every forecasting strategy and every ticker, run the CV splits,
# preprocess the training part, tune/fit a TS_RNN, forecast the validation and test
# windows, map the forecasts back to price scale and store predictions, metrics and plots.
# models_dict[strategy]["mean_val_metrics" | "mean_test_metrics"][ticker] holds the
# metrics averaged over the CV folds.
models_dict = {}


def _build_transformer(config_key):
    """Instantiate the transformer class named in ``CONFIG[config_key]``.

    The class is looked up by name among the notebook globals (a plain lookup
    instead of ``eval``, so the config value can only name an imported object)
    and is constructed with the kwargs stored under the same name in
    ``CONFIG['TRANSFORMERS_ARGS']``.
    """
    transformer_name = CONFIG[config_key]
    return globals()[transformer_name](**CONFIG['TRANSFORMERS_ARGS'][transformer_name])


for strategy in CONFIG["STRATEGIES"]:
    # Constructor arguments for this strategy are assembled in a fresh dict so that the
    # shared CONFIG is not mutated by running this cell.
    # 'DirMo' is run with n_step_out=3, every other strategy with n_step_out=1.
    model_init_kwargs = {**CONFIG["MODEL"]['INIT'],
                         'strategy': strategy,
                         'n_step_out': 3 if strategy == 'DirMo' else 1}

    # Create model_folder
    model_name = strategy
    print(f"Model {model_name}")
    model_folder = os.path.join(exp_folder, model_name)
    os.makedirs(model_folder, exist_ok=True)

    for ticker_name in CONFIG['TARGET']['TICKERS']:
        target = targets_df[ticker_name].dropna()
        series_folder = os.path.join(model_folder, ticker_name.split('.')[0])
        os.mkdir(series_folder)

        cv_val_metrics, cv_test_metrics = [], []
        cv_val_predictions, cv_test_predictions = [], []

        for train_index, val_index in tscv.split(target):
            # The held-out tail is VAL_LEN + TEST_LEN long: first part is validation, rest is test
            test_index = val_index[CONFIG["VAL_LEN"]:]
            val_index = val_index[:CONFIG["VAL_LEN"]]

            iter_folder = f"TRAIN_{len(train_index)}_VAL_{len(val_index)}_TEST_{len(test_index)}"
            print(iter_folder)

            # Create folder for cv iteration
            iter_folder = os.path.join(series_folder, iter_folder)
            plots_folder = os.path.join(iter_folder, 'plots')
            os.mkdir(iter_folder)
            os.mkdir(plots_folder)
            plot_id = 0

            # The split yields integer positions, so select with .iloc: plain [] on a
            # date-indexed Series only falls back to positions via a deprecated code path.
            target_train = target.iloc[train_index]
            target_val = target.iloc[val_index]
            target_test = target.iloc[test_index]
            seve_fig_from_array(target_train, plots_folder, f'{plot_id}_target')
            plot_id += 1

            # Outlier handling is applied to the training part only; validation stays raw.
            target_val_tr = target_val
            if CONFIG["OUTLAYER_TRANSFORMERS"] is not None:
                outlayer_transformer = _build_transformer("OUTLAYER_TRANSFORMERS")
                target_train_tr = outlayer_transformer.fit_transform(target_train).interpolate(limit_direction="both")
                seve_fig_from_array(target_train_tr, plots_folder, f'{plot_id}_target_outlayers_removed')
                plot_id += 1
            else:
                target_train_tr = target_train

            if CONFIG["SEASON_TRANSFORMERS"] is not None:
                seasonal_transformer = _build_transformer("SEASON_TRANSFORMERS")
                target_train_tr = seasonal_transformer.fit_transform(target_train_tr)
                target_val_tr = seasonal_transformer.transform(target_val_tr)
                seve_fig_from_array(target_train_tr, plots_folder, f'{plot_id}_target_{CONFIG["SEASON_TRANSFORMERS"]}')
                plot_id += 1

            if CONFIG["BASE_TRANSFORMERS"] is not None:
                base_transformer = _build_transformer("BASE_TRANSFORMERS")
                target_train_tr = base_transformer.fit_transform(target_train_tr).interpolate(limit_direction="both")
                target_val_tr = base_transformer.transform(target_val_tr).interpolate(limit_direction="both")
                seve_fig_from_array(target_train_tr, plots_folder, f'{plot_id}_target_{CONFIG["BASE_TRANSFORMERS"]}')
                plot_id += 1

            # Normalize target (scaler is fitted on the training part only)
            if CONFIG["NORM_TRANSPORMERS"] is not None:
                target_scaler = _build_transformer("NORM_TRANSPORMERS")
                scaled_train = target_scaler.fit_transform(target_train_tr.values.reshape(-1, 1))
                scaled_val = target_scaler.transform(target_val_tr.values.reshape(-1, 1))

                target_train_tr = pd.Series(scaled_train.flatten(), index=target_train.index, name=ticker_name)
                target_val_tr = pd.Series(scaled_val.flatten(), index=target_val.index, name=ticker_name)

                seve_fig_from_array(target_train_tr, plots_folder, f'{plot_id}_target_scaled')
                plot_id += 1

            model = TS_RNN(save_dir=iter_folder, **model_init_kwargs)
            model.logger.info(f'[Experiment] {model_name}')
            model.logger.info(f'[Experiment] {ticker_name}_TRAIN_{len(train_index)}_VAL_{len(val_index)}_TEST_{len(test_index)}')

            model.fit(target_train=target_train_tr,
                      target_val=target_val_tr,
                      **CONFIG["MODEL"]['FIT'],
                      verbose=1)

            # Remove the tuner logs
            shutil.rmtree(os.path.join(iter_folder, 'TS_RNN_tuner_log'))

            # Validation forecast starts from the last n_lags training points; the test
            # forecast starts from the last n_lags points of train + validation.
            predicted_val = model.predict(target=target_train_tr.iloc[-model.n_lags:],
                                          prediction_len=CONFIG["VAL_LEN"])
            # The test window is TEST_LEN long (the original passed VAL_LEN here).
            predicted_test = model.predict(target=pd.concat([target_train_tr, target_val_tr], axis=0).iloc[-model.n_lags:],
                                           prediction_len=CONFIG["TEST_LEN"])

            # inverse transform, undoing the preprocessing steps in reverse order
            if CONFIG["NORM_TRANSPORMERS"] is not None:
                predicted_val = pd.Series(target_scaler.inverse_transform(predicted_val.reshape(-1, 1)).flatten(),
                                          index=target_val.index)
                predicted_test = pd.Series(target_scaler.inverse_transform(predicted_test.reshape(-1, 1)).flatten(),
                                           index=target_test.index)
            else:
                predicted_val = pd.Series(predicted_val, index=target_val.index)
                predicted_test = pd.Series(predicted_test, index=target_test.index)

            if CONFIG["BASE_TRANSFORMERS"] is not None:
                predicted_val = base_transformer.inverse_transform(predicted_val)
                if CONFIG["BASE_TRANSFORMERS"] == 'Differencer':
                    # Differences of the test window are integrated manually, starting
                    # from the last observed validation value.
                    predicted_test = target_val.iloc[-1] + predicted_test.cumsum()
                else:
                    predicted_test = base_transformer.inverse_transform(predicted_test)

            if CONFIG["SEASON_TRANSFORMERS"] is not None:
                predicted_val = seasonal_transformer.inverse_transform(predicted_val)
                predicted_test = seasonal_transformer.inverse_transform(predicted_test)

            predicted_val.name = ticker_name
            predicted_val.to_csv(os.path.join(iter_folder, 'val_predictions.csv'), mode='w', sep=';')
            predicted_test.name = ticker_name
            predicted_test.to_csv(os.path.join(iter_folder, 'test_predictions.csv'), mode='w', sep=';')

            # Calculate metrics; each actual series is cut to the length of its own forecast
            # (the original cut the test actuals by the length of the validation forecast).
            val_mertics = metrics_eval(target_val.iloc[:len(predicted_val)], predicted_val, save_dir=iter_folder, print_result=False, name="val_mertics")
            test_mertics = metrics_eval(target_test.iloc[:len(predicted_test)], predicted_test, save_dir=iter_folder, print_result=False, name='test_mertics')

            cv_val_metrics.append(val_mertics)
            cv_test_metrics.append(test_mertics)

            cv_val_predictions.append(predicted_val)
            cv_test_predictions.append(predicted_test)

            train_val_test_pred_plot(train=target_train,
                                     val=target_val,
                                     test=target_test,
                                     val_pred=predicted_val,
                                     test_pred=predicted_test,
                                     save_dir=plots_folder, show=False)

        # Average the per-fold metrics for this (strategy, ticker) pair.
        model_results = models_dict.setdefault(model_name, {})
        model_results.setdefault("mean_val_metrics", {})[ticker_name] = dict(pd.DataFrame(cv_val_metrics).mean())
        model_results.setdefault("mean_test_metrics", {})[ticker_name] = dict(pd.DataFrame(cv_test_metrics).mean())

Trial 1 Complete [00h 00m 21s]
val_loss: 0.0623948760330677

Best val_loss So Far: 0.0623948760330677
Total elapsed time: 00h 00m 21s


## ARIMA training

In [None]:
import pmdarima as pm

# Auto-ARIMA baseline: same tickers and CV splits as the RNN experiment. One model is
# fitted per fold on the raw training prices and forecasts the validation and test
# windows in a single call; fold-averaged metrics are added to models_dict.
# Create model_folder
model_name = "Auto ARIMA"
model_folder = os.path.join(exp_folder, model_name)
os.mkdir(model_folder)

for ticker_name in CONFIG['TARGET']['TICKERS']:
    target = targets_df[ticker_name].dropna()
    series_folder = os.path.join(model_folder, ticker_name.split('.')[0])
    os.mkdir(series_folder)

    cv_val_metrics, cv_test_metrics = [], []
    cv_val_predictions, cv_test_predictions = [], []

    for train_index, val_index in tscv.split(target):
        # The held-out tail is VAL_LEN + TEST_LEN long: first part is validation, rest is test
        test_index = val_index[CONFIG["VAL_LEN"]:]
        val_index = val_index[:CONFIG["VAL_LEN"]]
        iter_folder = f"TRAIN_{len(train_index)}_VAL_{len(val_index)}_TEST_{len(test_index)}"
        print(iter_folder)

        # Create folder for cv iteration
        iter_folder = os.path.join(series_folder, iter_folder)
        plots_folder = os.path.join(iter_folder, 'plots')
        os.mkdir(iter_folder)
        os.mkdir(plots_folder)
        plot_id = 0

        # The split yields integer positions, so select with .iloc: plain [] on a
        # date-indexed Series only falls back to positions via a deprecated code path.
        target_train = target.iloc[train_index]
        target_val = target.iloc[val_index]
        target_test = target.iloc[test_index]
        seve_fig_from_array(target_train, plots_folder, f'{plot_id}_target')
        plot_id += 1

        arima_model = pm.auto_arima(target_train,
                                    start_p=1, start_q=1,
                                    test='adf',
                                    max_p=3, max_q=3, m=12,
                                    start_P=0, seasonal=True,
                                    d=None, D=1, trace=True,
                                    error_action='ignore',
                                    suppress_warnings=True,
                                    stepwise=True)

        # Forecast validation and test in one go. The horizon and the split point follow
        # VAL_LEN / TEST_LEN (the original used TEST_LEN for both, which is only correct
        # while the two lengths are equal).
        # np.asarray drops any index the forecast may carry, so the values are assigned to
        # the fold's dates by position; re-wrapping a labelled Series with a different
        # index would align on labels instead. NOTE(review): whether predict returns an
        # array or a Series depends on the installed pmdarima version - confirm.
        predicted_arima = np.asarray(arima_model.predict(n_periods=CONFIG["VAL_LEN"] + CONFIG["TEST_LEN"]))
        predicted_val = pd.Series(predicted_arima[:CONFIG["VAL_LEN"]], index=target_val.index)
        predicted_test = pd.Series(predicted_arima[CONFIG["VAL_LEN"]:], index=target_test.index)

        # Calculate metrics; each actual series is cut to the length of its own forecast
        # (the original cut the test actuals by the length of the validation forecast).
        val_mertics = metrics_eval(target_val.iloc[:len(predicted_val)], predicted_val, save_dir=iter_folder, print_result=False, name="val_mertics")
        test_mertics = metrics_eval(target_test.iloc[:len(predicted_test)], predicted_test, save_dir=iter_folder, print_result=False, name='test_mertics')

        cv_val_metrics.append(val_mertics)
        cv_test_metrics.append(test_mertics)

        cv_val_predictions.append(predicted_val)
        cv_test_predictions.append(predicted_test)

        train_val_test_pred_plot(train=target_train,
                                 val=target_val,
                                 test=target_test,
                                 val_pred=predicted_val,
                                 test_pred=predicted_test,
                                 save_dir=plots_folder, show=False)

    # Average the per-fold metrics for this ticker.
    model_results = models_dict.setdefault(model_name, {})
    model_results.setdefault("mean_val_metrics", {})[ticker_name] = dict(pd.DataFrame(cv_val_metrics).mean())
    model_results.setdefault("mean_test_metrics", {})[ticker_name] = dict(pd.DataFrame(cv_test_metrics).mean())

In [82]:
# Persist the fold-averaged metrics of every model and ticker.
result_path = os.path.join(exp_folder, "exp_result.json")
with open(result_path, "w") as outfile:
    json.dump(models_dict, outfile, skipkeys=True)

# One summary table per evaluation window: metrics are averaged over tickers, rounded
# to two decimals, and models are ordered by their mean absolute percentage error.
for mode_name in ('val', 'test'):
    metrics_key = f'mean_{mode_name}_metrics'

    per_model = {}
    for model_key, model_result in models_dict.items():
        # Frame is metric x ticker; the row-wise mean collapses the tickers.
        per_model[model_key] = pd.DataFrame(model_result[metrics_key]).mean(axis=1).to_dict()

    exp_metrics = pd.DataFrame(per_model).round(2)
    # Columns are models here, so sorting along axis=1 orders the models; the transpose
    # then puts one model per row.
    exp_metrics = exp_metrics.sort_values(by='Mean absolute percentage error', axis=1).transpose()

    exp_metrics.to_csv(os.path.join(exp_folder, f'exp_{mode_name}_metrics.csv'), mode='w', sep=';')
    display(exp_metrics)

Unnamed: 0,Mean Absolute Error,Mean Squared Error,Symmetric Mean absolute percentage error,Root Mean Squared Error,Mean absolute percentage error
DirMo,32.28,3501.02,1.82,38.21,1.81
Direct,32.82,3685.85,1.84,38.94,1.83
DirRec,34.11,3933.93,1.93,40.81,1.91
Recursive,35.56,4415.34,1.96,42.8,1.95
MiMo,37.75,6353.49,1.96,44.53,1.95


Unnamed: 0,Mean Absolute Error,Mean Squared Error,Symmetric Mean absolute percentage error,Root Mean Squared Error,Mean absolute percentage error
MiMo,33.48,4727.58,2.11,41.19,2.09
DirMo,42.24,7956.5,2.24,50.38,2.21
Direct,42.78,7831.94,2.34,50.88,2.31
Recursive,46.15,9236.1,2.37,54.69,2.33
DirRec,46.51,10320.09,2.41,55.16,2.37
