# Keras LSTM Pipeline using all stocks

## Setup

In [2]:
%load_ext autoreload
%autoreload 2

from notebook_config import setup_notebook
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

# Project-local notebook initialisation, imported from notebook_config above.
# NOTE(review): what it configures is not visible in this notebook — confirm in notebook_config.
setup_notebook()

In [3]:
from datasets.stocks_data_wrapper import StocksDataWrapper
from helpers.data_helper import *
from helpers.plots_helper import *

In [16]:
# --- Input data -------------------------------------------------------------
# Each stock is read from f"{DATA_PATH}{quotation}{FILE_SUFFIX}".
DATA_PATH = '../../data/'
FILE_SUFFIX = '.txt'

# --- Default run settings ---------------------------------------------------
quotation = 'GOOG'
price_column = 'Close'
project_label = 'Pipeline'

# --- Output locations -------------------------------------------------------
OUTPUT_PATH = 'results/'
N_CLASSES = 2
# Metrics are written to the regression folder. The per-class path
# (f"{OUTPUT_PATH}metrics/{N_CLASSES}_classes/") that used to be assigned first
# was overwritten on the very next line, so it has been dropped.
METRICS_PATH = f"{OUTPUT_PATH}metrics/regression/"
PIPELINE_LABEL = 'LSTM'

# Compare by value: `is` tests object identity, which only happens to work for
# small ints in CPython and raises a SyntaxWarning on Python 3.8+.
THRESH_DIFF = 0.005 if N_CLASSES == 3 else None

## Prepare Data

In [17]:
# Default target column plus windowing / validation settings.
y_column = 'Next'
seq_len = 5
n_splits = 5
val_size = 0.2

# Default feature columns fed to the model.
features = [
    'Volume',
    'Close',
    'LowLen',
    'Difference',
    'SMA(20) - SMA(10)',
    'BG_H_Band_Indicator',
    'MACD',
]

In [18]:
PERFORM_GRID_SEARCH = True
MAX_SIZE = 5000
n_splits = 5

# Ticker symbols processed by the pipeline; each one is read from its own data file.
quotations = [
    'AAL', 'AAPL', 'AMZN', 'CMCSA', 'COST', 'GM', 'GOOG',
    'IBM', 'JNJ', 'KO', 'PEP', 'TSLA', 'WMT', 'XOM',
]

# Maps a prediction horizon (the `predict_n` value) to the feature columns used for it.
days_predict_feature_set = {
    1: ['Volume', 'Close', 'LowLen', 'Difference', 'BG_L_Band', 'GAP', 'MACD_diff'],
    5: ['Volume', 'Close', 'LowLen', 'Difference', 'SMA(20) - SMA(10)', 'BG_H_Band_Indicator', 'MACD'],
    10: ['Volume', 'Close', 'BodyLen', 'Difference', 'SMA(20) - SMA(10)', 'EMA_Diff', 'MACD_diff'],
    20: ['Volume', 'Close', 'LowLen', 'Difference', 'EMA(14)', 'BG_H_Band', 'MACD_diff'],
    50: ['Volume', 'Close', 'LowLen', 'Difference', 'BG_L_Band', 'GAP', 'MACD_diff'],
}

## Computation for all stocks

In [19]:
# Accumulator for the results, keyed by prediction horizon; filled by the training loop.
predict_n_metrics_dict = dict()

In [36]:
from keras.preprocessing import timeseries_dataset_from_array as build_timeseries_ds
from trainers.keras_regression_trainer import KerasRegressionTrainer
from models.keras_lstm_model import LSTMModel
import tensorflow as tf


# Run settings passed to LSTMModel; the training loop also reads
# 'batch_size' and 'learning_rate' from here.
config_dict = {
    'quotation': quotation,
    'predict_n': 1,
    'features': features,
    'learning_rate': 0.01,
    'batch_size': 32,
    'sequence_length': seq_len,
    'n_epochs': 100,
    'n_splits': n_splits,
    'val_size': val_size,
}

# Train and evaluate one LSTM regressor per (prediction horizon, stock) pair.
# Results are collected as:
#   predict_n_metrics_dict[predict_n][quot][PIPELINE_LABEL] = metrics
for predict_n, features_list in tqdm(days_predict_feature_set.items()):
    quot_metrics_dict = {}
    for quot in tqdm(quotations):
        # Keep the shared config in sync with the run that is actually executed.
        # Previously only 'quotation' was refreshed, so 'predict_n' and
        # 'features' kept the stale defaults for every run.
        config_dict.update(quotation=quot, predict_n=predict_n, features=features_list)

        data_wrapper = StocksDataWrapper.read_from(f"{DATA_PATH}{quot}{FILE_SUFFIX}",
                                                   compute_features=True,
                                                   predict_n=predict_n,
                                                   thresh_diff=None, normalize=True)

        # Window length and validation share come from the config so that the
        # dataset and the model below cannot drift apart.
        X_train, X_test, y_train, y_test = data_wrapper.get_datasets(
            n_splits=1, val_size=config_dict['val_size'],
            y_column='NextPrice', sequences=True,
            seq_len=config_dict['sequence_length'],
            features_list=features_list)

        train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(config_dict['batch_size'])
        test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(config_dict['batch_size'])

        # The input width must follow the feature list of the current horizon,
        # not the unrelated global `features` list.
        # NOTE(review): 'accuracy' is not a meaningful metric for an MSE
        # regression; kept as-is because LSTMModel's handling of `metrics` is
        # not visible here — consider switching to 'mae'.
        model = LSTMModel(config_dict, config_dict['sequence_length'], len(features_list), output_dim=1,
                          learning_rate=config_dict['learning_rate'], loss='mse', metrics=['accuracy'])
        trainer = KerasRegressionTrainer(model, use_wandb=False, project_label=project_label)
        trainer.train(train_set)

        quot_metrics_dict[quot] = {PIPELINE_LABEL: trainer.evaluate(test_set)}

    # A fresh dict is built for every horizon, so no defensive copy is needed.
    predict_n_metrics_dict[predict_n] = quot_metrics_dict
        

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [38]:
# Dump the full nested results ({predict_n: {quotation: {model label: metrics}}}).
print(predict_n_metrics_dict)

{1: {'AAL': {'LSTM': {'mse': 0.005345122227051616, 'mae': 0.05257157798733653, 'mape': 0.21990598038471207}}, 'AAPL': {'LSTM': {'mse': 0.017545464383322218, 'mae': 0.0812107074589214, 'mape': 0.1249528339729033}}, 'AMZN': {'LSTM': {'mse': 0.05575904951941301, 'mae': 0.08353973285680166, 'mape': 0.1069917737014646}}, 'CMCSA': {'LSTM': {'mse': 2.0211231027007273, 'mae': 0.579481754745886, 'mape': 0.7118292490721969}}, 'COST': {'LSTM': {'mse': 0.00815555524013827, 'mae': 0.05274442761851318, 'mape': 0.0694132219238792}}, 'GM': {'LSTM': {'mse': 0.00045101721871451585, 'mae': 0.014615910434787077, 'mape': 112858393628.61473}}, 'GOOG': {'LSTM': {'mse': 0.00037475664051470303, 'mae': 0.016120389774140407, 'mape': 0.02399854129077886}}, 'IBM': {'LSTM': {'mse': 0.01849880040624655, 'mae': 0.10918556610848268, 'mape': 0.138286006918422}}, 'JNJ': {'LSTM': {'mse': 0.0005046457153907478, 'mae': 0.01666287176379775, 'mape': 0.020603070627265976}}, 'KO': {'LSTM': {'mse': 0.00038719923099188856, 'mae'

### Add to metrics

In [39]:
# Show the file path this pipeline's metrics dictionary is saved to.
print(f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt")

results/metrics/regression/LSTM_dict.txt


In [40]:
# Persist this pipeline's metrics in their own file.
lstm_metrics_file = f"{METRICS_PATH}{PIPELINE_LABEL}_dict.txt"
save_dict(dict_save=predict_n_metrics_dict, path=lstm_metrics_file)

In [41]:
# Load the metrics saved by earlier pipelines and cast the top-level keys to
# int so they match the integer horizons used in this notebook.
previous_metrics = {
    int(horizon): per_stock_metrics
    for horizon, per_stock_metrics in read_dict(f"{METRICS_PATH}full_metrics_dict.txt").items()
}
previous_metrics[1]

{'AAL': {'DTree': {'mse': 0.00040002611775352936,
   'mae': 0.01517712473316475,
   'mape': 0.03865700863537383},
  'RandomForest': {'mse': 0.00026469454356206737,
   'mae': 0.011830471121597129,
   'mape': 0.0317035815048061},
  'SVM': {'mse': 0.0012326616718660055,
   'mae': 0.027130217484537124,
   'mape': 0.08522520213496947},
  'LR': {'mse': 0.0002459189601130199,
   'mae': 0.011412489432978266,
   'mape': 0.03143306038231758},
  'LinearNN': {'mse': 0.17441429197788239,
   'mae': 0.3135688900947571,
   'mape': 1.242722988128662}},
 'AAPL': {'DTree': {'mse': 0.0639864274135731,
   'mae': 0.1675341345697533,
   'mape': 0.5330979677282099},
  'RandomForest': {'mse': 0.06479528918158887,
   'mae': 0.1696178740919568,
   'mape': 0.5438718114409096},
  'SVM': {'mse': 0.13882785240228357,
   'mae': 0.31834349124692446,
   'mape': 1.9994437529407039},
  'LR': {'mse': 0.0001616190201255988,
   'mae': 0.007981748225476632,
   'mape': 0.016132374904576908},
  'LinearNN': {'mse': 0.7655281424

In [43]:
from copy import deepcopy

# Merge the LSTM results into the previously saved metrics and write them back.
# A deep copy is required: dict.copy() is shallow, so an in-place merge into
# the nested per-stock dicts would also modify `previous_metrics`.
full_metrics = deepcopy(previous_metrics)
merge_metric_dicts(full_metrics, predict_n_metrics_dict)

save_dict(full_metrics, f"{METRICS_PATH}full_metrics_dict.txt")

In [44]:
from helpers.data_helper import save_predictions_heatmaps

# Save heatmaps of the merged metrics through the project helper.
# NOTE(review): the meaning of `reversed` is defined in helpers.data_helper — confirm there.
save_predictions_heatmaps(
    path=METRICS_PATH,
    metrics_dict=full_metrics,
    metrics_names_list=['mse', 'mae', 'mape'],
    reversed=True,
)

In [16]:
# Save one heatmap per classification metric ('acc', 'f1') and prediction horizon.
# The metrics produced in this notebook only contain 'mse'/'mae'/'mape', so an
# unconditional metrics['acc'] lookup raised KeyError. Metrics that are absent
# are now skipped with a message instead of aborting the cell.
for predict_n, quot_metrics in full_metrics.items():
    # Rows are stocks, columns are models; each cell holds that model's metrics dict.
    metrics_df = pd.DataFrame.from_dict(quot_metrics).T

    for metric_name in ('acc', 'f1'):
        # Cells for models missing on a stock are NaN (not dicts), and a metrics
        # dict may lack this key; both cases map to NaN rather than raising.
        metric_df = metrics_df.applymap(
            lambda metrics: metrics.get(metric_name, np.nan) if isinstance(metrics, dict) else np.nan
        ).astype(float)

        if metric_df.isna().all().all():
            print(f"No '{metric_name}' values for predict_n={predict_n}; skipping heatmap.")
            continue

        plt.figure()
        heatmap = sns.heatmap(metric_df, cmap ='mako', linewidths = 0.5, annot = True)
        heatmap.figure.savefig(f"{METRICS_PATH}{predict_n}_{metric_name}.png")
        plt.close()