In [1]:
# Install libs
!pip install ts-rnn yfinance sktime pmdarima

In [1]:
import os
import numpy as np
import pandas as pd
import yfinance as yf
from keras_tuner import HyperParameters
from tensorflow.keras import callbacks
import matplotlib.pyplot as plt
from copy import deepcopy
import shutil
import json
import time

from ts_rnn.model import TS_RNN
from ts_rnn.feature_selection import feature_importance
from ts_rnn.utils import metrics_eval, train_val_test_pred_plot

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split

from sktime.transformations.series.detrend import Deseasonalizer
from sktime.transformations.series.difference import Differencer
from sktime.transformations.series.outlier_detection import HampelFilter
from sktime.transformations.series.boxcox import BoxCoxTransformer, LogTransformer

2022-11-16 21:08:01.903158: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Experiment configuration

In [4]:
# Hyperparameter container; the same object is later handed to the model
# through CONFIG["MODEL"]["INIT"]["tuner_hp"].
hp = HyperParameters()


def _lstm_layer(return_sequences):
    """Build one ["LSTM", kwargs] entry of the architecture description.

    NOTE(review): every call registers the hyperparameters under the same
    names ('units', 'LSTM_1_activation'), so both LSTM layers presumably share
    one tuned value each -- confirm this is intended.
    """
    units = hp.Int(name='units',
                   min_value=2,
                   max_value=40,
                   step=10,
                   default=12)
    activation = hp.Choice(name='LSTM_1_activation',
                           values=['relu', 'tanh', 'sigmoid', "linear"],
                           default='relu')
    return ["LSTM", {"units": units,
                     "return_sequences": return_sequences,
                     "kernel_initializer": "glorot_uniform",
                     "activation": activation}]


def _dropout_layer():
    """Build the ["Dropout", kwargs] entry with a tunable rate."""
    rate = hp.Float(name='dropout',
                    min_value=0.0,
                    max_value=0.5,
                    default=0.2,
                    step=0.05)
    return ["Dropout", {"rate": rate}]


# Network description: LSTM -> Dropout -> LSTM -> Dense(linear).
# The list literal is evaluated left to right, so hyperparameters are
# registered in layer order.
rnn_arch = {
    "layers": [
        _lstm_layer(return_sequences=True),
        _dropout_layer(),
        _lstm_layer(return_sequences=False),
        ["Dense", {"activation": "linear"}],
    ]
}

# Early stopping on validation loss with a patience of 10 epochs.
my_callbacks = [callbacks.EarlyStopping(monitor='val_loss', patience=10)]

# Series to forecast.
_target_cfg = {
    'TICKERS': [
        'YNDX.ME',
        'SBER.ME',
        'POLY.ME',
        'SIBN.ME',
        'AMZN',
        'AAPL',
        'GOOGL',
        'NFLX',
    ],
    'MIN_DATE': '2012-01-01',
    'MAX_DATE': '2022-01-01',
}

# Exogenous series offered to the model as factors.
_factors_cfg = {
    'TICKERS': [
        'USDRUB=X',
        'EURRUB=X',
        'BZ=F',   # Brent
        'GC=F',   # Gold
        '^GSPC',  # S&P 500
        '^IXIC',  # NASDAQ
        '^DJI',   # Dow Jones
    ],
    'MIN_DATE': '2012-01-01',
    'MAX_DATE': '2022-01-01',
}

# Keyword arguments forwarded to TS_RNN(...) ('INIT') and TS_RNN.fit(...) ('FIT').
_model_cfg = {
    'INIT': {
        'rnn_arch': rnn_arch,
        'tuner_hp': hp,
        "strategy": "MiMo",  # "Direct", "Recursive", "MiMo"
        "n_lags": 30,
        "horizon": 7,
        "tuner": "BayesianOptimization",  # "RandomSearch", "BayesianOptimization", "Hyperband"
        "max_trials": 10,
        "loss": 'mae',
        "optimizer": 'adam',
    },
    'FIT': {
        "epochs": 100,
        "batch_size": 14,
        'callbacks': my_callbacks,
    },
}

# Constructor kwargs per transformer class name; looked up by the names
# stored under the *_TRANSFORMERS / NORM_TRANSPORMERS keys below.
_transformers_args = {
    'HampelFilter': {'window_length': 10},
    'MinMaxScaler': {"feature_range": (0, 1)},
    'Differencer': {'lags': [1]},
    'LogTransformer': {},
    'BoxCoxTransformer': {},
    'Deseasonalizer': {'sp': 364,
                       'model': 'multiplicative'},
}

# Key spellings (including "OUTLAYER" and "TRANSPORMERS") are read verbatim
# by the training cell, so they are kept exactly as they are.
CONFIG = {
    "TARGET": _target_cfg,
    'FACTORS': _factors_cfg,

    "VAL_LEN": 7,
    "TEST_LEN": 7,
    'CV_FOLDS': 5,
    "MODEL": _model_cfg,
    "OUTLAYER_TRANSFORMERS": 'HampelFilter',
    "SEASON_TRANSFORMERS": None,
    "BASE_TRANSFORMERS": 'Differencer',  # 'Differencer', 'LogTransformer', 'BoxCoxTransformer',
    "NORM_TRANSPORMERS": "MinMaxScaler",
    "TRANSFORMERS_ARGS": _transformers_args,
    'FEATURE_SELECTION': {'ratio': 0.3, 'metric': "mae", 'max_iter': 100},
}

# Each CV fold holds out VAL_LEN + TEST_LEN points; the training cell splits
# that hold-out window into a validation part and a test part.
tscv = TimeSeriesSplit(
    n_splits=CONFIG["CV_FOLDS"],
    test_size=CONFIG["VAL_LEN"] + CONFIG["TEST_LEN"],
    gap=0,
)

In [5]:
def seve_fig_from_array(array, path, fig_name):
    """Plot `array` as a line chart and save it to `path`/`fig_name`.

    A dedicated figure is created for every call, so the plot never lands on
    a figure left open by other code, and the figure is closed even when
    saving fails.

    Parameters
    ----------
    array : array-like or pandas.Series
        Data handed to ``Axes.plot``.
    path : str
        Directory in which the image is written.
    fig_name : str
        File name handed (joined with `path`) to ``Figure.savefig``.
    """
    fig, ax = plt.subplots()
    try:
        ax.plot(array)
        fig.savefig(os.path.join(path, fig_name))
    finally:
        # Always release the figure; this function is called many times in loops.
        plt.close(fig)

## RNN training

In [None]:
def _download_adj_close(tickers, start, end):
    """Download the 'Adj Close' prices for `tickers` and return them as a DataFrame.

    `auto_adjust=False` is passed explicitly because the 'Adj Close' column is
    only present in the download result when prices are not auto-adjusted.
    A single-series result is converted to a one-column frame named after the
    first ticker.
    """
    prices = yf.download(tickers, start=start, end=end, auto_adjust=False)['Adj Close']
    if isinstance(prices, pd.Series):
        prices.name = tickers[0]
        prices = prices.to_frame()
    return prices


# Import targets
targets_df = _download_adj_close(CONFIG['TARGET']['TICKERS'],
                                 start=CONFIG['TARGET']['MIN_DATE'],
                                 end=CONFIG['TARGET']['MAX_DATE'])
# Re-index to a gap-free daily calendar and interpolate the missing days.
full_ind = pd.date_range(targets_df.index.min(), targets_df.index.max())
targets_df = targets_df.reindex(full_ind, fill_value=np.nan).interpolate()

# Import factors
if CONFIG['FACTORS']['TICKERS']:
    factors_df = _download_adj_close(CONFIG['FACTORS']['TICKERS'],
                                     start=CONFIG['FACTORS']['MIN_DATE'],
                                     end=CONFIG['FACTORS']['MAX_DATE'])
    factors_df = factors_df.reindex(full_ind, fill_value=np.nan).interpolate()
else:
    # No factors configured: keep an empty frame on the same calendar so the
    # alignment below still works instead of failing on an undefined name.
    factors_df = pd.DataFrame(index=full_ind)

# Keep only the dates on which every target and every factor has a value.
both_index = pd.concat([targets_df, factors_df], axis=1).dropna().index
targets_df = targets_df.loc[both_index]
factors_df = factors_df.loc[both_index]

# Make new experiment folder
reports_dir = "./reports"
os.makedirs(reports_dir, exist_ok=True)

# Start from "number of entries + 1" and move on to the next number if that
# folder already exists (e.g. after earlier experiment folders were removed or
# other files were placed in the reports directory).
folder_num = len(os.listdir(reports_dir)) + 1
while True:
    new_folder_num = str(folder_num)
    exp_folder = os.path.join(reports_dir, "factors_experiment_" + new_folder_num)
    try:
        os.mkdir(exp_folder)
    except FileExistsError:
        folder_num += 1
    else:
        break
print(f"Save experiment in {exp_folder}")

# Save config: work on a copy and drop the entries holding live Python objects
# (tuner hyperparameters, Keras callbacks), which cannot be written as JSON.
CONFIG_ = deepcopy(CONFIG)
del CONFIG_['MODEL']['INIT']['tuner_hp']
del CONFIG_['MODEL']['FIT']['callbacks']
with open(os.path.join(exp_folder, "exp_config.json"), "w") as outfile:
    json.dump(CONFIG_, outfile, skipkeys=True)


# Experiment loop.
# For every model variant (with / without factors) and every cross-validation
# fold: preprocess each target series, fit a TS_RNN, optionally keep only the
# important factors and refit, predict the validation and test windows, undo
# the preprocessing and store predictions, metrics and plots on disk.
# `models_dict` collects, per model variant and per target, the fold-averaged
# validation and test metrics.
#
# NOTE: transformer classes are resolved with eval() from names stored in
# CONFIG. CONFIG is defined in this notebook; never fill it from untrusted input.
models_dict = {}

for model_type in ['GLOBAL']: # 'LOCAL', 'GLOBAL'
    for use_factors in [True, False]:
        # Create model_folder
        model_name = model_type + ("_WITH_FACTORS" if use_factors else "_WITHOUT_FACTORS")
        print(f"Model {model_name}")
        model_folder = os.path.join(exp_folder, model_name)
        os.mkdir(model_folder)

        # LOCAL iterates ticker by ticker; GLOBAL passes the whole ticker list at once.
        iter_list = CONFIG['TARGET']['TICKERS'] if model_type == 'LOCAL' else [CONFIG['TARGET']['TICKERS']]

        for ticker_name in iter_list:
            target = targets_df[ticker_name].dropna()
            if isinstance(target, pd.Series):
                target = target.to_frame()
            if model_type == 'LOCAL':
                series_folder = os.path.join(model_folder, ticker_name.split('.')[0])
            else:
                series_folder = os.path.join(model_folder, 'GLOBAL')
            os.mkdir(series_folder)

            # Fold metrics are collected per target column so that every
            # target gets its own average below.
            cv_val_metrics, cv_test_metrics = {}, {}
            cv_val_predictions, cv_test_predictions = [], []

            for train_index, val_index in tscv.split(target):
                start_time = time.time()

                # divide val to val and test
                test_index = val_index[CONFIG["VAL_LEN"]:]
                val_index = val_index[:CONFIG["VAL_LEN"]]

                # Create folder for cv iteration
                iter_folder = os.path.join(
                    series_folder,
                    f"TRAIN_{len(train_index)}_VAL_{len(val_index)}_TEST_{len(test_index)}")
                os.mkdir(iter_folder)

                target_train = target.iloc[train_index]
                target_val = target.iloc[val_index]
                target_test = target.iloc[test_index]

                target_train_tr_full = None
                target_val_tr_full = None

                # Fitted transformers per target, needed later to invert the predictions.
                processers_dict = {}
                for target_name in target_train.columns:
                    short_name = target_name.split('.')[0]
                    target_folder = os.path.join(iter_folder, short_name)
                    os.mkdir(target_folder)
                    plots_folder = os.path.join(target_folder, 'plots')
                    os.mkdir(plots_folder)
                    plot_id = 0

                    target_train_smpl = target_train[target_name]
                    target_val_smpl = target_val[target_name]

                    seve_fig_from_array(target_train_smpl, plots_folder, f'{plot_id}_target_{short_name}')
                    plot_id += 1

                    processers_dict[target_name] = {}

                    # Outlier handling is applied to the training part only;
                    # the validation part is passed through unchanged.
                    target_train_tr = target_train_smpl
                    target_val_tr = target_val_smpl
                    if CONFIG["OUTLAYER_TRANSFORMERS"] is not None:
                        outlayer_transformer = eval(CONFIG["OUTLAYER_TRANSFORMERS"])(
                            **CONFIG['TRANSFORMERS_ARGS'][CONFIG["OUTLAYER_TRANSFORMERS"]])
                        target_train_tr = outlayer_transformer.fit_transform(target_train_smpl).interpolate(
                            limit_direction="both")
                        seve_fig_from_array(target_train_tr, plots_folder,
                                            f'{plot_id}_target_outlayers_removed_{short_name}')
                        plot_id += 1

                    if CONFIG["SEASON_TRANSFORMERS"] is not None:
                        seasonal_transformer = eval(CONFIG["SEASON_TRANSFORMERS"])(
                            **CONFIG['TRANSFORMERS_ARGS'][CONFIG["SEASON_TRANSFORMERS"]])
                        target_train_tr = seasonal_transformer.fit_transform(target_train_tr)
                        target_val_tr = seasonal_transformer.transform(target_val_tr)
                        processers_dict[target_name]['SEASON_TRANSFORMERS'] = seasonal_transformer

                        seve_fig_from_array(target_train_tr, plots_folder,
                                            f'{plot_id}_target_{CONFIG["SEASON_TRANSFORMERS"]}_{short_name}')
                        plot_id += 1

                    if CONFIG["BASE_TRANSFORMERS"] is not None:
                        base_transformer = eval(CONFIG["BASE_TRANSFORMERS"])(
                            **CONFIG['TRANSFORMERS_ARGS'][CONFIG["BASE_TRANSFORMERS"]])
                        target_train_tr = base_transformer.fit_transform(target_train_tr).interpolate(
                            limit_direction="both")
                        target_val_tr = base_transformer.transform(target_val_tr).interpolate(limit_direction="both")
                        processers_dict[target_name]['BASE_TRANSFORMERS'] = base_transformer

                        seve_fig_from_array(target_train_tr, plots_folder,
                                            f'{plot_id}_target_{CONFIG["BASE_TRANSFORMERS"]}_{short_name}')
                        plot_id += 1

                    # Normalize target
                    if CONFIG["NORM_TRANSPORMERS"] is not None:
                        target_scaler = eval(CONFIG["NORM_TRANSPORMERS"])(
                            **CONFIG['TRANSFORMERS_ARGS'][CONFIG["NORM_TRANSPORMERS"]])
                        # The scaler is fitted on the training part only.
                        target_train_tr = target_scaler.fit_transform(target_train_tr.values.reshape(-1, 1))
                        target_val_tr = target_scaler.transform(target_val_tr.values.reshape(-1, 1))
                        target_train_tr = pd.Series(target_train_tr.flatten(), index=target_train.index, name=target_name)
                        target_val_tr = pd.Series(target_val_tr.flatten(), index=target_val.index, name=target_name)

                        processers_dict[target_name]['NORM_TRANSPORMERS'] = target_scaler

                        seve_fig_from_array(target_train_tr, plots_folder,
                                            f'{plot_id}_target_scaled_{short_name}')
                        plot_id += 1

                    # Stack the processed targets column by column.
                    if target_train_tr_full is None:
                        target_train_tr_full = target_train_tr
                        target_val_tr_full = target_val_tr
                    else:
                        target_train_tr_full = pd.concat([target_train_tr_full, target_train_tr], axis=1)
                        target_val_tr_full = pd.concat([target_val_tr_full, target_val_tr], axis=1)

                if use_factors:
                    factors_train, factors_val = factors_df.iloc[train_index], factors_df.iloc[val_index]
                    factors_scaler = eval(CONFIG["NORM_TRANSPORMERS"])(
                        **CONFIG['TRANSFORMERS_ARGS'][CONFIG["NORM_TRANSPORMERS"]])
                    factors_train_tr = factors_scaler.fit_transform(factors_train.values)
                    factors_val_tr = factors_scaler.transform(factors_val.values)
                    factors_train_tr = pd.DataFrame(factors_train_tr, index=factors_train.index, columns=factors_train.columns)
                    factors_val_tr = pd.DataFrame(factors_val_tr, index=factors_val.index, columns=factors_val.columns)

                model = TS_RNN(save_dir=iter_folder,
                               n_features=factors_train_tr.shape[1] if use_factors else 0,
                               **CONFIG["MODEL"]['INIT'])
                model.logger.info(f'[Experiment] {model_name}')
                model.logger.info(f'[Experiment] {ticker_name}_TRAIN_{len(train_index)}_VAL_{len(val_index)}_TEST_{len(test_index)}')

                if isinstance(target_train_tr_full, pd.Series):
                    target_train_tr_full = target_train_tr_full.to_frame()
                if isinstance(target_val_tr_full, pd.Series):
                    target_val_tr_full = target_val_tr_full.to_frame()

                model.fit(target_train=target_train_tr_full,
                          target_val=target_val_tr_full,
                          factors_train=factors_train_tr if use_factors else None,
                          factors_val=factors_val_tr if use_factors else None,
                          **CONFIG["MODEL"]['FIT'],
                          verbose=1)

                if use_factors:
                    # take only important factors and retrain
                    # (presumably feature_importance returns factor column names --
                    # its result is used below to select columns of the factor frames)
                    factors_imp_names = set()
                    for target_ind, target_name in enumerate(target_train.columns):
                        target_folder = os.path.join(iter_folder, target_name.split('.')[0])
                        factors_names_i = feature_importance(target_train=target_train_tr_full.iloc[:, target_ind],
                                                             target_val=target_val_tr_full.iloc[:, target_ind],
                                                             factors_train=factors_train_tr,
                                                             model=model,
                                                             save_dir=target_folder,
                                                             **CONFIG['FEATURE_SELECTION'])
                        factors_imp_names.update(factors_names_i)

                    # Select with a list in the original column order: indexing a
                    # DataFrame with a set is not supported by recent pandas and
                    # would make the column order vary between runs.
                    selected_factors = [col for col in factors_train_tr.columns if col in factors_imp_names]
                    factors_train_tr = factors_train_tr[selected_factors]
                    factors_val_tr = factors_val_tr[selected_factors]

                    model = TS_RNN(save_dir=iter_folder,
                                   n_features=factors_train_tr.shape[1],
                                   **CONFIG["MODEL"]['INIT'])

                    model.fit(target_train=target_train_tr_full,
                              target_val=target_val_tr_full,
                              factors_train=factors_train_tr,
                              factors_val=factors_val_tr,
                              **CONFIG["MODEL"]['FIT'],
                              verbose=1)

                # Remove the tuner logs
                shutil.rmtree(os.path.join(iter_folder, 'TS_RNN_tuner_log'))

                for target_ind, target_name in enumerate(target_train.columns):

                    target_folder = os.path.join(iter_folder, target_name.split('.')[0])
                    plots_folder = os.path.join(target_folder, 'plots')

                    # Validation forecast starts from the last n_lags training points;
                    # test forecast starts from the last n_lags points of train + val.
                    predicted_val = model.predict(factors=factors_train_tr.iloc[-model.n_lags:] if use_factors else None,
                                                  target=target_train_tr_full.iloc[-model.n_lags:, target_ind],
                                                  prediction_len=CONFIG["VAL_LEN"])
                    predicted_test = model.predict(factors=pd.concat([factors_train_tr,
                                                                      factors_val_tr], axis=0).iloc[-model.n_lags:] if use_factors else None,
                                                   target=pd.concat([target_train_tr_full, target_val_tr_full], axis=0).iloc[-model.n_lags:, target_ind],
                                                   prediction_len=CONFIG["TEST_LEN"])

                    # inverse transform (reverse order of the preprocessing steps)
                    if CONFIG["NORM_TRANSPORMERS"] is not None:
                        target_scaler = processers_dict[target_name]['NORM_TRANSPORMERS']
                        predicted_val = pd.Series(target_scaler.inverse_transform(predicted_val.reshape(-1, 1))
                                                  .flatten(),
                                                  index=target_val.index)
                        predicted_test = pd.Series(target_scaler.inverse_transform(predicted_test.reshape(-1, 1))
                                                   .flatten(),
                                                   index=target_test.index)
                    else:
                        predicted_val = pd.Series(predicted_val, index=target_val.index)
                        predicted_test = pd.Series(predicted_test, index=target_test.index)

                    if CONFIG["BASE_TRANSFORMERS"] is not None:
                        base_transformer = processers_dict[target_name]['BASE_TRANSFORMERS']
                        predicted_val = base_transformer.inverse_transform(predicted_val)
                        if CONFIG["BASE_TRANSFORMERS"] == 'Differencer':
                            # Differences are integrated starting from the last
                            # observed validation value.
                            predicted_test = target_val.iloc[-1, target_ind] + predicted_test.cumsum()
                        else:
                            predicted_test = base_transformer.inverse_transform(predicted_test)

                    if CONFIG["SEASON_TRANSFORMERS"] is not None:
                        seasonal_transformer = processers_dict[target_name]['SEASON_TRANSFORMERS']
                        predicted_val = seasonal_transformer.inverse_transform(predicted_val)
                        predicted_test = seasonal_transformer.inverse_transform(predicted_test)

                    predicted_val.name = target_name
                    predicted_val.to_csv(os.path.join(target_folder, f'val_predictions.csv'), mode='w', sep=';')
                    predicted_test.name = target_name
                    predicted_test.to_csv(os.path.join(target_folder, f'test_predictions.csv'), mode='w', sep=';')

                    # Calculate metrics (each truth window is cut to the length of
                    # its own prediction)
                    val_mertics = metrics_eval(target_val[:len(predicted_val)][target_name], predicted_val, save_dir=target_folder,
                                               print_result=False, name="val_mertics")
                    test_mertics = metrics_eval(target_test[:len(predicted_test)][target_name], predicted_test, save_dir=target_folder,
                                                print_result=False, name='test_mertics')

                    cv_val_metrics.setdefault(target_name, []).append(val_mertics)
                    cv_test_metrics.setdefault(target_name, []).append(test_mertics)

                    cv_val_predictions.append(predicted_val)
                    cv_test_predictions.append(predicted_test)

                    train_val_test_pred_plot(train=target_train[target_name],
                                             val=target_val[target_name],
                                             test=target_test[target_name],
                                             val_pred=predicted_val,
                                             test_pred=predicted_test,
                                             name_add=target_name,
                                             save_dir=plots_folder, show=False)

                # Log the duration of this CV iteration.
                end_time = time.time()
                model.logger.info(f"[Experiment] Iteration takes: {round(end_time - start_time, 2)} sec")

            # Average the fold metrics separately for every target.
            model_results = models_dict.setdefault(model_name, {})
            mean_val_metrics = model_results.setdefault("mean_val_metrics", {})
            mean_test_metrics = model_results.setdefault("mean_test_metrics", {})
            for metrics_target, fold_metrics in cv_val_metrics.items():
                mean_val_metrics[metrics_target] = dict(pd.DataFrame(fold_metrics).mean())
            for metrics_target, fold_metrics in cv_test_metrics.items():
                mean_test_metrics[metrics_target] = dict(pd.DataFrame(fold_metrics).mean())

In [79]:
# Persist the per-model, per-target mean metrics of the experiment.
with open(os.path.join(exp_folder, "exp_result.json"), "w") as outfile:
    json.dump(models_dict, outfile, skipkeys=True)

# Build one summary table per evaluation window: rows are model variants,
# columns are metrics, each value is the mean over the targets.
for mode_name in ['val', 'test']:
    per_model_means = {
        model_key: pd.DataFrame(model_results[f'mean_{mode_name}_metrics']).mean(axis=1).to_dict()
        for model_key, model_results in models_dict.items()
    }
    exp_metrics = (pd.DataFrame(per_model_means)
                   # Sort on the unrounded values so near-ties keep their true order,
                   # then round once for presentation.
                   .sort_values(by='Mean absolute percentage error', axis=1)
                   .round(2)
                   .transpose()
                   )
    exp_metrics.to_csv(os.path.join(exp_folder, f'exp_{mode_name}_metrics.csv'), mode='w', sep=';')
    display(exp_metrics)

Unnamed: 0,Mean Absolute Error,Mean Squared Error,Symmetric Mean absolute percentage error,Root Mean Squared Error,Mean absolute percentage error
LOCAL_WITH_FACTORS,37.35,5983.8,1.92,43.69,1.92
LOCAL_WITHOUT_FACTORS,36.82,6132.89,1.93,43.45,1.92
GLOBAL_WITHOUT_FACTORS,137.18,63201.3,7.35,157.46,6.91
GLOBAL_WITH_FACTORS,139.9,65764.89,7.5,160.09,7.04


Unnamed: 0,Mean Absolute Error,Mean Squared Error,Symmetric Mean absolute percentage error,Root Mean Squared Error,Mean absolute percentage error
LOCAL_WITH_FACTORS,30.31,3272.6,2.02,37.71,2.0
LOCAL_WITHOUT_FACTORS,32.01,3947.23,2.1,39.95,2.07
GLOBAL_WITHOUT_FACTORS,150.87,75732.38,7.66,171.63,7.21
GLOBAL_WITH_FACTORS,155.39,79798.76,7.94,176.34,7.46
