In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
# Notebook-wide pandas display settings: show every column at unlimited width;
# frames longer than 100 rows are truncated to a 20-row preview.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 20)
import datetime
from psutil import Process
from IPython.display import display_html 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

# About

#### In this notebook, I used trade data from stock market for tech stocks Apple, Amazon, Microsoft, Nvidia, Tesla and Google
#### I've used the Huge Dataset for US stocks to get extra stock information - it will be useful as features for the neural network
#### To visualize I'm using Seaborn and Matplotlib
#### Long Short Term Memory(LSTM) is used to predict future stock prices

# Why LSTM

Stock prices evolve over time: all OHLCV data is time series data and can't be shuffled, because future prices depend on past ones.

In mathematics, a time series is a series of data points indexed (or listed or graphed) in time order.

As tensorflow.org/tutorials/structured_data/time_series says, a Recurrent Neural Network (RNN) is a type of neural network well-suited to time series data. 

RNNs process a time series step-by-step, maintaining an internal state from time-step to time-step.

So we try here to use LSTM - an RNN layer called Long Short-Term Memory.

# Config

#### Let's get daily prices OHLCV for Apple, Amazon, Microsoft, Nvidia, Tesla and Google

In [None]:
# Configuration: edit these values to analyse other stocks or another timeframe.
# `names` holds the display names used in chart titles and must follow the order of `tickers`.
tickers = [
    "AAPL.US",
    "AMZN.US",
    "MSFT.US",
    "NVDA.US",
    "TSLA.US",
    "GOOGL.US",
]
names = [
    "Apple",
    "Amazon",
    "Microsoft",
    "Nvidia",
    "Tesla",
    "Google",
]
timeframe = "D1"

# Get Market Data

#### Do it just for Apple - to see its fields and data

In [None]:
ticker = "AAPL.US"
df_AAPL = pd.read_csv(f"stock_data/{timeframe}/{ticker}_{timeframe}.csv")
df_AAPL

# 1298 Columns

#### You see we have a lot of columns

In [None]:
print(list(df_AAPL.columns))

# Description

#### From the description of the https://www.kaggle.com/datasets/olegshpagin/extra-us-stocks-market-data dataset:
**Columns with Momentum Indicator** values ADX - Average Directional Movement Index ADXR - Average Directional Movement Index Rating APO - Absolute Price Oscillator AROON - Aroon AROONOSC - Aroon Oscillator BOP - Balance Of Power CCI - Commodity Channel Index CMO - Chande Momentum Oscillator DX - Directional Movement Index MACD - Moving Average Convergence/Divergence MACDEXT - MACD with controllable MA type MACDFIX - Moving Average Convergence/Divergence Fix 12/26 MFI - Money Flow Index MINUS_DI - Minus Directional Indicator MINUS_DM - Minus Directional Movement MOM - Momentum PLUS_DI - Plus Directional Indicator PLUS_DM - Plus Directional Movement PPO - Percentage Price Oscillator ROC - Rate of change : ((price/prevPrice)-1)100 ROCP - Rate of change Percentage: (price-prevPrice)/prevPrice ROCR - Rate of change ratio: (price/prevPrice) ROCR100 - Rate of change ratio 100 scale: (price/prevPrice)100 RSI - Relative Strength Index STOCH - Stochastic STOCHF - Stochastic Fast STOCHRSI - Stochastic Relative Strength Index TRIX - 1-day Rate-Of-Change (ROC) of a Triple Smooth EMA ULTOSC - Ultimate Oscillator WILLR - Williams' %R

**Columns with Volatility Indicator** values ATR - Average True Range NATR - Normalized Average True Range TRANGE - True Range

**Columns with Volume Indicator** values AD - Chaikin A/D Line ADOSC - Chaikin A/D Oscillator OBV - On Balance Volume

**Columns with Overlap Studies** values BBANDS - Bollinger Bands DEMA - Double Exponential Moving Average EMA - Exponential Moving Average HT_TRENDLINE - Hilbert Transform - Instantaneous Trendline KAMA - Kaufman Adaptive Moving Average MA - Moving average MAMA - MESA Adaptive Moving Average MAVP - Moving average with variable period MIDPOINT - MidPoint over period MIDPRICE - Midpoint Price over period SAR - Parabolic SAR SAREXT - Parabolic SAR - Extended SMA - Simple Moving Average T3 - Triple Exponential Moving Average (T3) TEMA - Triple Exponential Moving Average TRIMA - Triangular Moving Average WMA - Weighted Moving Average

**Columns with Cycle Indicator** values HT_DCPERIOD - Hilbert Transform - Dominant Cycle Period HT_DCPHASE - Hilbert Transform - Dominant Cycle Phase HT_PHASOR - Hilbert Transform - Phasor Components HT_SINE - Hilbert Transform - SineWave HT_TRENDMODE - Hilbert Transform - Trend vs Cycle Mode

# Selecting cols

#### Let's choose only several for our code
  - usual: datetime, open, high, low, close, volume 
  - and extra fields from 1298 columns: Rate of change (periods 10, 20, 50): **roc_10, roc_20, roc_50** + Simple Moving Average (periods 10, 20, 50): **sma_10, sma_20, sma_50**

In [None]:
selected_columns = ["datetime", "open", "high", "low", "close", "volume", "roc_10", "roc_20", "roc_50", "sma_10", "sma_20", "sma_50"]

#### Do it for all shares

In [None]:
# Load every ticker, parsing only the selected columns instead of all ~1298.
df_all = {}
for ticker in tickers:
    df_all[ticker] = pd.read_csv(
        f"stock_data/{timeframe}/{ticker}_{timeframe}.csv",
        usecols=selected_columns,
    )[selected_columns]  # usecols keeps the file's column order, so re-select to restore ours

# SIDE by SIDE

#### Let's look at our dataframes SIDE by SIDE
#### P.S. If you copy|edit this notebook, you will see these dataframes side by side 
#### To simplify this, we create a function for it

In [None]:
def show_dfs_in_side_by_side(dfs, captions):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to show, left to right.
    captions : sequence of str
        One caption per frame, in the same order as ``dfs``.

    Returns
    -------
    None
        The HTML is sent straight to the notebook display.
    """
    html_parts = []
    for i, _df in enumerate(dfs):
        _caption = captions[i]
        # inline display keeps the tables on one row instead of stacking them
        _df_styler = _df.style.set_table_attributes("style='display:inline'").set_caption(_caption)
        html_parts.append(_df_styler._repr_html_())
    # raw=True publishes the payload as-is, so send one HTML string rather than a list
    display_html("".join(html_parts), raw=True)

In [None]:
dict_values = list(df_all.values())
dict_names = list(df_all.keys())

#### Just get one df to check

In [None]:
dfs=dict_values[0]
print(type(dfs))
print(dfs.info())
dfs

In [None]:
n = 30
# keep only the first n rows of every frame so the side-by-side view stays compact
dict_values_n = [frame[:n] for frame in dict_values]

#### To see all dataframes in side by side

In [None]:
show_dfs_in_side_by_side(dfs=dict_values_n, captions=dict_names)
# P.S. If you copy|edit this notebook, then you see these three dataframes in side by side 

# Data exploring

* the data is numeric: open, high, low, close and the extra fields are floats, volume is an int
* "datetime" is a regular column holding the date of each row (the frames keep the default integer index)
* weekends are missing from the records of the data

# Data compressing

#### we can free memory we are using by ~50%, let's get corresponding function from https://www.kaggle.com/code/olegshpagin/how-to-reduce-memory-usage-for-dataframe

In [None]:
def reduce_mem_usage(df, verbose=0):
    """Downcast the numeric columns of ``df`` to smaller dtypes to save memory.

    * Integer columns get the narrowest integer type of the same signedness
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are stored as float32.
    * Every other column (object, bool, datetime, pandas extension dtypes, ...)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : int or bool, default 0
        When truthy, print the memory usage before/after and the saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        # duplicated column labels yield a DataFrame here; leave those alone
        if not isinstance(column, pd.Series):
            continue
        col_type = column.dtype
        # only plain NumPy dtypes are downcast; extension dtypes are skipped
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in "iu":
            if column.empty:
                continue  # no min/max to choose a range from
            c_min, c_max = column.min(), column.max()
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = column.astype(candidate)
                    break
        elif col_type.kind == "f":
            # float16 is deliberately not used: it is too coarse for prices/indicators
            df[col] = column.astype(np.float32)

    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Decreased by {decrease:.2f}%")
    return df

# How much memory we are using now?

In [None]:
def checkMemory():
    """Return a formatted string with the memory used by the current kernel process, in GB.

    The number is the first field of psutil's ``memory_info()`` for this process,
    divided by 2**30.
    Source - https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
    """
    bytes_per_gb = 2. ** 30
    current_process = Process(os.getpid())
    memory_use = current_process.memory_info()[0] / bytes_per_gb
    return f"RAM memory GB usage = {memory_use :.4}"
checkMemory()

In [None]:
# applying function to reduce memory use for one df of Apple dataframe
df_AAPL = reduce_mem_usage(df_AAPL, verbose=1)

In [None]:
df_AAPL

#### Now we see really good improvements for our memory costs: It was decreased by 49.96%!

In [None]:
checkMemory()

#### you can apply it to huge dataframes and get a lot of memory free

# Charts

#### The Closing Price is the final price for trading day. Investors use this price to measure profit/loss across time.

# Close

## Close Price charts

In [None]:
num_prices = 365  # how many days to show

sns.set_style('darkgrid')
n_cols = 2
n_rows = max(1, -(-len(df_all) // n_cols))  # ceiling division: the grid grows with the ticker list
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
axes = axes.ravel()

for i, ticker in enumerate(df_all):
    ax = axes[i]
    # tail() also copes with histories shorter than num_prices (a negative slice start would not)
    df_all[ticker]['close'].tail(num_prices).plot(ax=ax)
    ax.set_ylabel('close')
    ax.set_xlabel('days')
    ax.set_title(f"closing price of {names[i]}")

for ax in axes[len(df_all):]:
    ax.set_visible(False)  # hide grid slots that have no ticker

fig.tight_layout()

# Volume

## Volume charts

In [None]:
num_prices = 365  # how many days to show

sns.set_style('darkgrid')
n_cols = 2
n_rows = max(1, -(-len(df_all) // n_cols))  # ceiling division: the grid grows with the ticker list
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
axes = axes.ravel()

for i, ticker in enumerate(df_all):
    ax = axes[i]
    volume = df_all[ticker]['volume'].tail(num_prices)
    # size the x positions from the data so a history shorter than num_prices still plots
    ax.bar(range(len(volume)), volume, width=0.7)
    ax.set_ylabel('volume')
    ax.set_xlabel('days')
    ax.set_title(f"volume of {names[i]}")

for ax in axes[len(df_all):]:
    ax.set_visible(False)  # hide grid slots that have no ticker

fig.tight_layout()

# SMA 10,20,50:

## Close + SMA(10, 20, 50) charts

In [None]:
num_prices = 365  # how many days to show

sns.set_style('darkgrid')
n_cols = 2
n_rows = max(1, -(-len(df_all) // n_cols))  # ceiling division: the grid grows with the ticker list
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
axes = axes.ravel()

for i, ticker in enumerate(df_all):
    ax = axes[i]
    for column in ('close', 'sma_10', 'sma_20', 'sma_50'):
        # tail() also copes with histories shorter than num_prices
        df_all[ticker][column].tail(num_prices).plot(ax=ax, label=column)
    ax.set_ylabel('close')
    ax.set_xlabel('days')
    ax.set_title(f"closing price of {names[i]}")
    ax.legend()  # four overlapping lines are unreadable without one

for ax in axes[len(df_all):]:
    ax.set_visible(False)  # hide grid slots that have no ticker

fig.tight_layout()

# ROC 10,20,50:

## Rate of change charts

In [None]:
num_prices = 365  # how many days to show

sns.set_style('darkgrid')
n_cols = 2
n_rows = max(1, -(-len(df_all) // n_cols))  # ceiling division: the grid grows with the ticker list
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
axes = axes.ravel()

for i, ticker in enumerate(df_all):
    ax = axes[i]
    for column in ('roc_10', 'roc_20', 'roc_50'):
        # tail() also copes with histories shorter than num_prices
        df_all[ticker][column].tail(num_prices).plot(ax=ax, label=column)
    ax.set_ylabel('roc')
    ax.set_xlabel('days')
    ax.set_title(f"Rate of change {names[i]}")
    ax.legend()  # tells the three periods apart

for ax in axes[len(df_all):]:
    ax.set_visible(False)  # hide grid slots that have no ticker

fig.tight_layout()

## Rate of change histogram

In [None]:
num_prices = 365  # how many days to show

sns.set_style('darkgrid')
n_cols = 2
n_rows = max(1, -(-len(df_all) // n_cols))  # ceiling division: the grid grows with the ticker list
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
axes = axes.ravel()

for i, ticker in enumerate(df_all):
    ax = axes[i]
    for column in ('roc_10', 'roc_20', 'roc_50'):
        # semi-transparent bars keep the overlapping distributions visible
        df_all[ticker][column].tail(num_prices).hist(bins=50, ax=ax, alpha=0.6, label=column)
    # a histogram has the value on x and the number of days on y
    ax.set_xlabel('roc')
    ax.set_ylabel('count')
    ax.set_title(f"Rate of change {names[i]}")
    ax.legend()

for ax in axes[len(df_all):]:
    ax.set_visible(False)  # hide grid slots that have no ticker

fig.tight_layout()

# Correlation

#### Correlation in one df of columns for Apple

In [None]:
# building a correlogram - to find correlation between the selected Apple columns
sns.set(font_scale=0.9)  # slightly smaller fonts so the annotations fit into the cells
# select first: copying the whole frame would duplicate all ~1298 columns just to drop them
df_filtered = df_AAPL[selected_columns].drop('datetime', axis=1)
corr = df_filtered.corr()  # computed once, reused for the cells and both tick label sets
plt.figure(figsize=(18, 16), dpi=80)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap='RdYlGn', center=0, annot=True, fmt=".3f")
plt.title('Correlatogram of columns for Apple')
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.show()

#### As open, high, low, close and the SMAs are highly correlated with each other, it's enough to use one of them: close. In addition we can use roc_10

#### Correlation between shares by Close price
#### see more here https://www.kaggle.com/code/olegshpagin/how-to-find-a-correlation-between-shares

In [None]:
# Load close and roc_10 for every ticker and line the rows up by trading date.
# NOTE(review): if the tickers have different history lengths (likely, their listing
# dates differ), joining on the default row number pairs prices from different days
# and distorts every cross-ticker correlation - confirm against the CSV files.
frames = []
for ticker in tickers:
    df = pd.read_csv(
        f"stock_data/{timeframe}/{ticker}_{timeframe}.csv",
        usecols=["datetime", "close", "roc_10"],  # skip the remaining columns of the file
        parse_dates=["datetime"],
        index_col="datetime",
    )[["close", "roc_10"]]
    df = df[~df.index.duplicated(keep="last")]  # concat(axis=1) needs unique dates
    frames.append(df.add_suffix(f'_{ticker}'))

# one concat instead of growing a frame inside the loop; the outer join keeps every date
aligned = pd.concat(frames, axis=1).sort_index()
# back to a plain 0..n-1 index: later cells look rows up by position
all_dfs = aligned.reset_index(drop=True)  # df with col: close and roc_10
all_close = all_dfs[[f'close_{ticker}' for ticker in tickers]]  # df with col: close
all_dfs.sort_index(axis=0)


In [None]:
all_close.sort_index(axis=0)

In [None]:
tickers_str = ', '.join(tickers)
# building a correlogram - to find correlation between shares
sns.set(font_scale=0.9)  # slightly smaller fonts so the annotations fit into the cells
df_filtered = all_dfs.copy()
corr = df_filtered.corr()  # computed once, reused for the cells and both tick label sets
plt.figure(figsize=(18, 16), dpi=80)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap='RdYlGn', center=0, annot=True, fmt=".3f")
plt.title(f'Correlatogram of columns for {tickers_str}')
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.show()

# Correlation (view 2)

#### Let's compare the daily percentage return for two stocks - how they are correlated 

In [None]:
close_returns = all_close.pct_change()
close_returns.head()

In [None]:
sns.jointplot(x='close_AAPL.US', y='close_MSFT.US', data=close_returns, kind='scatter', color='seagreen')

#### We  see that close price for Apple has outliers, let's check it

In [None]:
df_AAPL_check = close_returns["close_AAPL.US"]
df_AAPL_check.describe()

#### max value is 6!!! wow! it means there was a 600% daily change in price

In [None]:
all_close['close_AAPL.US'].describe()

In [None]:
# show us how many values to specific len_seq
_uniq = np.unique(close_returns["close_AAPL.US"].values, return_counts=True)
# let's see last 100 values
last_100 = _uniq[0][len(_uniq[0])-100:]
last_100

#### we see - the last value is NaN: pct_change() has no previous price to compare the first row with, and np.unique sorts NaN to the end

In [None]:
# show us how many values to specific len_seq
_uniq = np.unique(close_returns["close_AMZN.US"].values, return_counts=True)
# let's see last 100 values
last_100 = _uniq[0][len(_uniq[0])-100:]
last_100

In [None]:
sns.jointplot(x='close_AMZN.US', y='close_MSFT.US', data=close_returns, kind='scatter', color='seagreen')

#### We can compare every stock with others in our tickers list

In [None]:
sns.pairplot(close_returns, kind='reg')

#### We need to remove outlier 6 value from Apple

In [None]:
# keep daily returns below +100% and blank out everything else (NaN stays NaN);
# the vectorised form works for any index, not only for row labels 0..n-1
close_returns["close_AAPL.US"] = close_returns["close_AAPL.US"].where(close_returns["close_AAPL.US"] < 1)

#### Now redraw correlations

In [None]:
sns.pairplot(close_returns, kind='reg')

#### Now we can see in numbers that almost all these stocks have strong correlation (except Tesla) / from correlogram

# Prediction

#### Prediction for close price of Apple

# Prepare data

#### let's see full df for Apple

In [None]:
df_AAPL

#### we will use only several columns for prediction

In [None]:
print(selected_columns)
df_AAPL_filtered = df_AAPL[selected_columns]

In [None]:
data_split_percent = 0.9  # 90% to train and 10% to validate

In [None]:
# create a new dataframe with only the close+roc_10 columns
data = df_AAPL_filtered[['close', 'roc_10']]
# convert the dataframe to a numpy array
dataset = data.values
# get the number of rows to train the model on
training_data_len = int(np.ceil( len(dataset) * data_split_percent ))

training_data_len

In [None]:
dataset

In [None]:
data

# Normalize data

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))
# learn min/max from the training rows only, so the scaling never sees test-period values;
# later rows may therefore fall outside [0, 1]
scaler.fit(dataset[:training_data_len])
scaled_data = scaler.transform(dataset)

In [None]:
scaled_data

# Split data 

#### Train data

In [None]:
train_data = scaled_data[0:int(training_data_len), :]

In [None]:
train_data

In [None]:
pd.DataFrame(train_data).head(25)

In [None]:
# train_data is shorter than dataset: it holds only the first data_split_percent share of the rows
print(len(dataset))
print(len(train_data))

In [None]:
train_data.shape

#### Test data

In [None]:
test_data = scaled_data[int(training_data_len):, :]

In [None]:
test_data

In [None]:
# test_data holds the rows after the training split, so train + test lengths add up to the dataset length
print(len(dataset))
print(len(train_data))
print(len(test_data))
print(len(train_data)+len(test_data))

In [None]:
test_data.shape

# Train dataset

In [None]:
size_of_train_set = 150

#### Right here we are forming N-dimensional input

In [None]:
# split the data into x_train and y_train data sets
x_train = []
y_train = []

for i in range(size_of_train_set, len(train_data)):
    x_train.append(train_data[i-size_of_train_set:i, :])  # by use ":" - we are forming N-dimensional input
    y_train.append(train_data[i, 0])  # output - is just next close scalered by MinMaxScaler
    if i<= size_of_train_set or i==size_of_train_set*10 or i==(size_of_train_set*10+1):
        print(x_train[-1])
        print(y_train[-1])
        print()

In [None]:
# convert the x_train and y_train to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)

In [None]:
x_train

In [None]:
y_train

In [None]:
print(x_train.shape)
print(y_train.shape)

# Test dataset

In [None]:
# split the data into x_test and y_test data sets
x_test = []
y_test = []

for i in range(size_of_train_set, len(test_data)):
    x_test.append(test_data[i-size_of_train_set:i, :])  # by use ":" - we are forming N-dimensional input
    y_test.append(test_data[i, 0])  # output - is just next close scalered by MinMaxScaler
    if i<= size_of_train_set or i==size_of_train_set*10 or i==(size_of_train_set*10+1):
        print(x_test[-1])
        print(y_test[-1])
        print()

In [None]:
# convert the x_test and y_test to numpy arrays
x_test, y_test = np.array(x_test), np.array(y_test)

In [None]:
x_test

In [None]:
y_test

In [None]:
print(x_test.shape)
print(y_test.shape)

#### **=======================================================================================================================**

# Data to train

#### 1-dimension

In [None]:
# create a new dataframe with only the close column
# tail() keeps the last 3000 days and returns the whole frame when the history is shorter
data = df_AAPL_filtered[['close']].tail(3000)
data

In [None]:
# convert the dataframe to a numpy array
dataset = data.values
dataset

# Normalize data

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)

# Params for split data

In [None]:
def split_data_into_buckets(data, size_of_set):
    """Cut a 2-D array into sliding windows and the value that follows each window.

    Parameters
    ----------
    data : numpy.ndarray, 2-D (rows, features)
        Rows in time order; column 0 is the value to predict.
    size_of_set : int
        Number of consecutive rows in every input window.

    Returns
    -------
    (list, list)
        ``xx[k]`` is ``data[k:k + size_of_set, :]`` and ``yy[k]`` is column 0 of the
        row right after that window. Both lists are empty when ``data`` has no
        more than ``size_of_set`` rows.
    """
    xx, yy = [], []
    for i in range(size_of_set, len(data)):
        xx.append(data[i-size_of_set:i, :])  # ":" keeps every feature column in the window
        yy.append(data[i, 0])  # target is the next value of column 0
    return xx, yy


def split_data_for_nn(data_to_split, size_of_set, train_percent, val_percent, test_percent):
    """Split a 2-D array chronologically into train/val/test windows for the network.

    All lengths and slices are taken from ``data_to_split`` itself, so the function
    does not depend on notebook-level variables.

    Parameters
    ----------
    data_to_split : numpy.ndarray, 2-D (rows, features)
        Rows in time order.
    size_of_set : int
        Window length passed on to ``split_data_into_buckets``.
    train_percent, val_percent, test_percent : float
        Share of the rows for each part; every share is rounded up to whole rows.

    Returns
    -------
    tuple
        ``x_train, y_train, x_val, y_val, x_test, y_test`` as numpy arrays, followed by
        ``train_data_len, val_data_len, test_data_len`` (row counts before windowing).
    """
    n_rows = len(data_to_split)
    all_len = n_rows - 1  # -1 to get future price of closing
    train_data_len = int(np.ceil(n_rows * train_percent))
    val_data_len = int(np.ceil(n_rows * val_percent))
    test_data_len = int(np.ceil(n_rows * test_percent))
    # rounding up can overshoot the available rows; the test part absorbs the difference
    if train_data_len + val_data_len + test_data_len > all_len:
        test_data_len = all_len - train_data_len - val_data_len
    print(train_data_len, val_data_len, test_data_len, "+=", train_data_len+val_data_len+test_data_len, "all_len:", all_len)

    val_end = train_data_len + val_data_len
    train_data = data_to_split[0:train_data_len, :]
    val_data = data_to_split[train_data_len:val_end, :]
    test_data = data_to_split[val_end:val_end+test_data_len, :]
    print(train_data.shape, val_data.shape, test_data.shape)

    x_train, y_train = split_data_into_buckets(train_data, size_of_set)
    x_val, y_val = split_data_into_buckets(val_data, size_of_set)
    x_test, y_test = split_data_into_buckets(test_data, size_of_set)

    x_train, y_train = np.array(x_train), np.array(y_train)
    x_val, y_val = np.array(x_val), np.array(y_val)
    x_test, y_test = np.array(x_test), np.array(y_test)

    return x_train, y_train, x_val, y_val, x_test, y_test, train_data_len, val_data_len, test_data_len

In [None]:
size_of_set = 60
x_train, y_train, x_val, y_val, x_test, y_test, train_data_len, val_data_len, test_data_len = split_data_for_nn(data_to_split=scaled_data, size_of_set=size_of_set, train_percent=0.7, val_percent=0.2, test_percent=0.1)
print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)
print(x_test.shape, y_test.shape)

# Params for NN

In [None]:
batch_size = 1  # x_train.shape[0] // 10
epochs = 1  # 250
print(f"batch_size={batch_size}, epochs={epochs}")
LR = 0.05  # 5e-2  # learning rate of the gradient descent
LAMBD = 0.03  # 3e-2  # lambda in L2 regularizaion
DP = 0.2  # dropout rate
RDP = 0.2  # recurrent dropout rate

# Model LSTM

#### Libraries for model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, BatchNormalization, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l2

#### Build the LSTM model

In [None]:
# Two stacked LSTM layers followed by a small dense head with a single output value.
# The input shape (window length, number of features) is taken from x_train.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
])

#### Compile the model

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error')
# model.compile(loss='mean_squared_error',
#                       metrics=['accuracy'],
#                       optimizer=Adam(learning_rate=LR))

In [None]:
# Define a learning rate decay method:
lr_decay = ReduceLROnPlateau(monitor='loss',
                             patience=1, verbose=0,
                             factor=0.5, min_lr=1e-8)
# Define Early Stopping:
# baseline stays None: a baseline of 0 can never be beaten by a non-negative MSE loss,
# so training would always stop after `patience` epochs no matter how val_loss improves
early_stop = EarlyStopping(monitor='val_loss', min_delta=0,
                           patience=30, verbose=1, mode='auto',
                           baseline=None, restore_best_weights=True)

#### Train the model

In [None]:
# use batch_size / epochs from the "Params for NN" cell so changing them there takes effect
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
# model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
#           validation_split=0.0,
#           validation_data=(x_val, y_val),
#           verbose=1,
#           callbacks=[lr_decay, early_stop])

#### scaler.inverse_transform from one dimension to two
#### workaround is function below - invTransform

In [None]:
# https://stackoverflow.com/questions/53049396/sklearn-inverse-transform-return-only-one-column-when-fit-to-many
# - scaler   = the scaler object (it needs an inverse_transform method)
# - data     = the data to be inverse transformed as a Series, ndarray, ... 
#              (a 1d object you can assign to a df column)
# - colName  = the name of the column to which the data belongs
# - colNames = all column names of the data on which scaler was fit 
#              (necessary because scaler will only accept a df of the same shape as the one it was fit on)
def invTransform(scaler, data, colName, colNames):
    """Inverse-transform one column with a scaler that was fit on several columns.

    The values are placed into a zero-filled frame with the scaler's full column
    layout, the frame is inverse-transformed, and only ``colName`` is returned.

    Returns
    -------
    numpy.ndarray
        1-D array with the inverse-transformed values of ``colName``.

    Raises
    ------
    ValueError
        If ``colName`` is not one of ``colNames``.
    """
    if colName not in colNames:
        # otherwise the assignment below would append an extra column the scaler never saw
        raise ValueError(f"colName {colName!r} is not in colNames {list(colNames)!r}")
    # positional 1-D values: a Series would otherwise be aligned on its index labels,
    # and an (n, 1) prediction array is flattened to n values
    values = np.asarray(data).ravel()
    dummy = pd.DataFrame(np.zeros((len(values), len(colNames))), columns=colNames)
    dummy[colName] = values
    restored = pd.DataFrame(scaler.inverse_transform(dummy), columns=dummy.columns)
    return restored[colName].values

# Test predict on val

In [None]:
# Get the models predicted price values
predictions = model.predict(x_val)
predictions = scaler.inverse_transform(predictions)  # back to prices, one column

predictions_val = predictions

# Bring the targets back to prices too and keep them as a column: comparing with the
# scaled 1-D y_val would mix units and broadcast (n, 1) - (n,) into an (n, n) matrix
y_val_prices = scaler.inverse_transform(y_val.reshape(-1, 1))

# Get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean((predictions - y_val_prices) ** 2))
print("rmse:", rmse)

In [None]:
print(len(x_val))
print(len(predictions_val.flatten().tolist()))
# print(type(predictions_val))

In [None]:
print(len(predictions_val))

# Test predict on test

In [None]:
# Get the models predicted price values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)  # back to prices, one column

predictions_test = predictions

# Bring the targets back to prices too and keep them as a column: comparing with the
# scaled 1-D y_test would mix units and broadcast (n, 1) - (n,) into an (n, n) matrix
y_test_prices = scaler.inverse_transform(y_test.reshape(-1, 1))

# Get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean((predictions_test - y_test_prices) ** 2))
print("rmse:", rmse)

# Plot the data

In [None]:
print(train_data_len, val_data_len, test_data_len)

In [None]:
# data_close = data["close"].values.tolist()
_data = data.filter(["close"])
print(len(_data))

In [None]:
_data

In [None]:
train_dataset = _data[0:train_data_len+size_of_set]
val_dataset = _data[train_data_len+size_of_set:train_data_len+val_data_len]
test_dataset = _data[train_data_len+size_of_set+val_data_len:train_data_len+val_data_len+test_data_len]

In [None]:
# work on a copy: val_dataset is a slice of _data, and adding a column to a slice
# triggers pandas' SettingWithCopyWarning
valid = val_dataset.copy()
valid['predictions'] = predictions_val.ravel()  # flatten the (n, 1) predictions into one column

In [None]:
# Visualize the data
plt.figure(figsize=(16,6))
plt.title('Model')
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.plot(train_dataset)
plt.plot(valid[['close', 'predictions']])
plt.legend(['train', 'val', 'predictions'], loc='lower right')
plt.show()

# Valid vs Predicted

In [None]:
valid

# Improvements

In [None]:
# here will be next code


# To be continued
#### P.S. This notebook in a process of creating, please be patient. Thanks.

# It is NOT finished yet))

#### Wow... too bad results in predictions

#### Feel free to leave comment. Thanks.