## Imports

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU, SimpleRNN, Conv1D
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
plt.rcParams.update({'figure.figsize': (16, 9)})
import matplotlib
import sklearn
import keras

In [4]:
!pip install ta
import ta

Collecting ta
  Downloading ta-0.7.0.tar.gz (25 kB)
Building wheels for collected packages: ta
  Building wheel for ta (setup.py): started
  Building wheel for ta (setup.py): finished with status 'done'
  Created wheel for ta: filename=ta-0.7.0-py3-none-any.whl size=28718 sha256=1f29583a73f835cb3f051ccef121fbdd9aedf7555dce2c7849aa6b5f34f95ed8
  Stored in directory: c:\users\mashfiqur\appdata\local\pip\cache\wheels\bb\7c\a0\9c72e50ddef1f7c3d9003bf4ccc5d5c8deb24828d4eb156fc8
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.7.0


## Load data

In [45]:
# Load the pre-processed price history for company 94.
# NOTE(review): parse_dates=True is given without index_col, so 'txn_date' is
# presumably still a string here; the explicit pd.to_datetime below converts it — confirm.
data = pd.read_csv('../data/processed/company_id_94_processed.csv', parse_dates=True)
# Keep the OHLC columns, volume and 'day', ordered by transaction date.
# ('month' and 'day_of_month' are deliberately left out of the feature set for now.)
data = data[['txn_date', 
             'open', 'high', 'low', 
             'close',
             'vol', 
#              'month', 
             'day', 
#              'day_of_month'
            ]].sort_values(by='txn_date')
data['txn_date'] = pd.to_datetime(data['txn_date'])
# data = data[data['txn_date']< '2020-03-01']
# Use the transaction date as the (datetime) index.
data.set_index('txn_date', inplace=True, drop=True)
# Derived features: simple rate of change of the close price and its log counterpart.
data['close_roc'] = data['close'].pct_change()
data['close_log_roc'] = np.log(1+data['close_roc'])
# Technical indicators from the `ta` package; fillna=True asks the library to fill
# the warm-up rows of each indicator rather than leaving NaN.
data['rsi'] = ta.momentum.RSIIndicator(close=data['close'], fillna=True).rsi()
data['macd_signal'] = ta.trend.MACD(close=data['close'], fillna=True).macd_signal()
data['chaikin_money_flow'] = ta.volume.ChaikinMoneyFlowIndicator(high=data['high'], low=data['low'], close=data['close'], volume=data['vol'], fillna=True).chaikin_money_flow()
# Drop rows that still contain NaN (e.g. the first row, which has no previous close for pct_change).
data.dropna(inplace=True)
# data = data.resample('W').mean()
data.head()

Unnamed: 0_level_0,open,high,low,close,vol,day,close_roc,close_log_roc,rsi,macd_signal,chaikin_money_flow
txn_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1999-01-11,58.9,58.9,58.0,58.3,0.0,0,0.003442,0.003436,100.0,0.003191,0.0
1999-01-12,58.6,58.7,58.4,58.5,0.0,1,0.003431,0.003425,100.0,0.011398,0.0
1999-01-13,58.6,58.7,58.5,58.6,0.0,2,0.001709,0.001708,100.0,0.023889,0.0
1999-01-14,58.5,59.5,58.1,58.8,0.0,3,0.003413,0.003407,100.0,0.041546,0.0
1999-01-23,59.0,59.0,58.6,58.7,0.0,5,-0.001701,-0.001702,85.310605,0.059825,0.0


## Preprocessing

In [6]:
# data.info()

In [7]:
# data.describe()

In [8]:
# min(data.index), max(data.index), data.shape[0]

In [9]:
# data['chaikin_money_flow'].plot()
# plt.xlabel('Date')
# plt.ylabel('Price')
# plt.legend(['Close Price'])
# plt.show()

In [10]:
# data['close_roc'].plot()
# plt.xlabel('Date')
# plt.ylabel('Close Price Rate Of Change')
# plt.show()

In [11]:
# data['vol'].plot()
# plt.xlabel('Date')
# plt.ylabel('Trade Volume')
# plt.show()

In [12]:
from sklearn import metrics
def timeseries_evaluation_metrics_func(y_true, y_pred):
    """Print MSE, MAE, RMSE, MAPE and R2 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, holding the same number of elements as ``y_true``.

    Returns
    -------
    None
        The metrics are written to stdout only.

    Raises
    ------
    ValueError
        Propagated from scikit-learn when the inputs contain NaN/inf or have
        inconsistent lengths.
    """
    def mean_absolute_percentage_error(y_true, y_pred):
        # Flatten both inputs so that a (n,) vector paired with a (n, 1) column
        # is compared element by element instead of being broadcast to (n, n).
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        # A percentage error is undefined where the true value is zero; those
        # points are excluded rather than turning the whole metric into inf/NaN.
        nonzero = y_true != 0
        if not nonzero.any():
            return float('nan')
        relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        return np.mean(np.abs(relative_error)) * 100
    print('Evaluation metric results:-')
    mse = metrics.mean_squared_error(y_true, y_pred)
    print(f'MSE is : {mse}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(mse)}')
    print(f'MAPE is : {mean_absolute_percentage_error(y_true, y_pred)}')
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}',end='\n\n')

In [13]:
def custom_ts_data_prep(dataset, target, start, end, window, horizon):
    """Cut a series into (history window, future targets) training pairs.

    For every anchor position ``i`` in ``[start + window, end)`` the input
    sample is ``dataset[i - window : i]`` and the label is
    ``target[i + 1 : i + 1 + horizon]``. When ``end`` is None the anchors stop
    at ``len(dataset) - horizon`` so the label rows stay inside the data.

    Returns a tuple ``(X, y)`` of numpy arrays built from the collected samples.
    """
    first_anchor = start + window
    last_anchor = len(dataset) - horizon if end is None else end
    anchors = range(first_anchor, last_anchor)

    # Index with explicit ranges (not slices) so an out-of-range anchor still
    # raises instead of silently producing a shorter sample.
    history = [dataset[range(pos - window, pos)] for pos in anchors]
    future = [target[range(pos + 1, pos + 1 + horizon)] for pos in anchors]
    return np.array(history), np.array(future)

In [46]:
# Fit the scalers on the training rows only. Fitting on the full series would
# let the min/max of the validation and test periods leak into the scaling of
# the training data. The fraction must mirror the 80 % used for TRAIN_SPLIT.
SCALER_FIT_ROWS = int(len(data) * 0.8)
x_scaler = MinMaxScaler(feature_range=(-1,1)).fit(data.iloc[:SCALER_FIT_ROWS])
y_scaler = MinMaxScaler(feature_range=(-1,1)).fit(data[['close']].iloc[:SCALER_FIT_ROWS])
# Apply the training-period scaling to the whole series; later rows may fall
# outside [-1, 1] when they exceed the range seen during training.
data_x = x_scaler.transform(data)
data_y = y_scaler.transform(data[['close']])

In [47]:
# Window the scaled series: 22 past rows predict the next `horizon` close values.
hist_window = 22
horizon = 1
# First 80 % of the rows feed the training windows; the remainder is held out.
TRAIN_SPLIT = int(len(data_x)*0.8)
x_train_multi, y_train_multi = custom_ts_data_prep(data_x, data_y, 0, TRAIN_SPLIT, hist_window, horizon)
x_holdout_multi, y_holdout_multi = custom_ts_data_prep(data_x, data_y, TRAIN_SPLIT, None, hist_window, horizon)
# Split the hold-out chronologically: first half for validation, second half
# for testing. The test slice must start at `split`; slicing with `[:-split]`
# would return the *leading* part of the hold-out and overlap the validation set.
split = int(len(x_holdout_multi)*0.5)
x_val_multi, y_val_multi = x_holdout_multi[:split], y_holdout_multi[:split]
x_test_multi, y_test_multi = x_holdout_multi[split:], y_holdout_multi[split:]

In [48]:
# Sanity checks: every split must pair each input window with exactly one target.
assert x_train_multi.shape[0] == y_train_multi.shape[0], 'train inputs and targets differ in length'
assert x_val_multi.shape[0] == y_val_multi.shape[0], 'validation inputs and targets differ in length'
assert x_test_multi.shape[0] == y_test_multi.shape[0], 'test inputs and targets differ in length'

In [49]:
# x_train_multi[:1], y_train_multi[:1]

In [50]:
# Input pipelines. Both datasets repeat indefinitely, so the number of batches
# consumed per epoch is set by steps_per_epoch / validation_steps at fit time.
BATCH_SIZE = 128
BUFFER_SIZE = 150
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train_multi, y_train_multi))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .repeat()
)
val_data = (
    tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
    .batch(BATCH_SIZE)
    .repeat()
)

In [19]:
# x_train_multi.shape

In [37]:
# Stacked LSTM regressor: three sequence-returning 40-unit layers (dropout 0.1
# after the second and third), a 20-unit LSTM that collapses the sequence, and
# a single linear output unit.
lstm_model = Sequential()
lstm_model.add(LSTM(units=40, return_sequences=True, input_shape=x_train_multi.shape[-2:]))
lstm_model.add(LSTM(units=40, return_sequences=True))
lstm_model.add(Dropout(0.1))
lstm_model.add(LSTM(units=40, return_sequences=True))
lstm_model.add(Dropout(0.1))
lstm_model.add(LSTM(units=20))
lstm_model.add(Dense(units=1))
lstm_model.compile(loss='mse', optimizer='adam')

In [51]:
# Location of the HDF5 checkpoint: ModelCheckpoint writes the best model here
# during training and it is loaded back from the same path afterwards.
model_path = r'../models/LSTM_Multivariate.h5'

In [2854]:
# Train the stacked LSTM. Training stops once val_loss has not improved for
# 5 consecutive epochs; the best model seen so far is checkpointed to model_path.
EVALUATION_INTERVAL = 100
EPOCHS = 150
history = lstm_model.fit(
    train_data,
    epochs=EPOCHS,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_data=val_data,
    validation_steps=50,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=0, patience=5, verbose=1, mode='min'),
        tf.keras.callbacks.ModelCheckpoint(
            model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=0),
    ],
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 00014: early stopping


In [40]:
# Reload the checkpoint written by ModelCheckpoint (the best model by val_loss),
# rather than using the in-memory weights from the last training epoch.
model = tf.keras.models.load_model(model_path)

In [41]:
# Print the layer-by-layer architecture and parameter counts of the reloaded model.
model.summary()

Model: "sequential_140"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_373 (LSTM)              (None, 22, 40)            8320      
_________________________________________________________________
lstm_374 (LSTM)              (None, 22, 40)            12960     
_________________________________________________________________
dropout_195 (Dropout)        (None, 22, 40)            0         
_________________________________________________________________
lstm_375 (LSTM)              (None, 22, 40)            12960     
_________________________________________________________________
dropout_196 (Dropout)        (None, 22, 40)            0         
_________________________________________________________________
lstm_376 (LSTM)              (None, 20)                4880      
_________________________________________________________________
dense_141 (Dense)            (None, 1)              

In [42]:
def show_model_loss_plot(history):
    """Plot training and validation loss per epoch.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain the ``'loss'`` and ``'val_loss'`` series.
    """
    # Training curve first, validation second, matching the legend order below.
    for series_name in ('loss', 'val_loss'):
        plt.plot(history.history[series_name])
    plt.title('Model loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train loss', 'validation loss'], loc='upper left')
    plt.show()

Evaluation metric results:-


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [2861]:
# np.std(y_val_), np.mean(y_val_) #(4.016922584171439, 28.52873134328358)

In [2862]:
# [y_scaler.inverse_transform(y[0].reshape(-1,1))[0][0] for y in y_val_multi]

In [34]:
def process(scaler, y_val_multi, 
            y_test_multi, 
            result_inv_trans,
            data, split):
    """Convert scaled targets and predictions back to original units, indexed by date.

    Parameters
    ----------
    scaler : object with ``inverse_transform``
        Scaler that was used for the target column.
    y_val_multi, y_test_multi : sequence of arrays
        Scaled validation / test targets; only the first step of every sample
        is used.
    result_inv_trans : sequence of arrays
        Scaled model predictions for the test inputs; only the first value of
        every prediction is used.
    data : pandas.DataFrame
        Frame whose index supplies the dates. The test targets and the
        predictions are aligned with its last rows, and the validation targets
        with the rows immediately before the test rows.
    split : int
        Kept for backward compatibility. The validation dates are now derived
        from the actual lengths, which keeps them adjacent to (and never
        overlapping) the test dates even when the two halves differ in size.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(validation_actuals, test_actuals, predictions)``, one column each.

    Raises
    ------
    ValueError
        If ``data`` has fewer rows than the series that must be aligned to it.
    """
    def _unscale_first_step(samples):
        # Take the first scalar of every sample and undo the scaling with one
        # inverse_transform call instead of one call per sample.
        firsts = [np.asarray(sample[0]).reshape(-1)[0] for sample in samples]
        if not firsts:
            return []
        column = np.asarray(firsts, dtype=float).reshape(-1, 1)
        return scaler.inverse_transform(column)[:, 0]

    n_rows = len(data.index)
    n_val = len(y_val_multi)
    n_test = len(y_test_multi)
    n_pred = len(result_inv_trans)
    if n_val + n_test > n_rows or n_pred > n_rows:
        raise ValueError('data has fewer rows than the series to align with its index')

    # Positive offsets avoid the `[-0:]` pitfall, which selects every row.
    val_stop = n_rows - n_test
    a = pd.DataFrame(_unscale_first_step(y_val_multi),
                     index=data.index[val_stop - n_val:val_stop])
    b = pd.DataFrame(_unscale_first_step(y_test_multi),
                     index=data.index[n_rows - n_test:])
    c = pd.DataFrame(_unscale_first_step(result_inv_trans),
                     index=data.index[n_rows - n_pred:])
    return a, b, c

In [30]:
def show_actual_vs_predicted_plot(a, b, c, ylabel="Price Rate of Change"):
    """Plot validation actuals, test actuals and predictions on one chart.

    Parameters
    ----------
    a : plottable series/frame
        Validation-period actual values (legend entry ``'val'``).
    b : plottable series/frame
        Test-period actual values (legend entry ``'Actual'``).
    c : plottable series/frame
        Model predictions (legend entry ``'predicted'``).
    ylabel : str, optional
        Y-axis label. Defaults to the previously hard-coded text; pass a
        different label when the plotted quantity is not a rate of change
        (for example a price level).
    """
    # Plot order must match the legend order below.
    for series in (a, b, c):
        plt.plot(series)
    plt.title("Actual vs Predicted")
    plt.ylabel(ylabel)
    plt.legend(('val','Actual','predicted'))
    plt.show()

In [56]:
# Baseline: one 15-unit SimpleRNN followed by a single linear output unit.
# The recurrent layer returns only its final state (return_sequences left at
# its default of False), so the network emits one value per input window,
# matching the single-step target. Returning the whole sequence would produce
# one output per timestep, and the downstream code would then read the
# prediction made after seeing only the first timestep.
model_rnn = Sequential([
    SimpleRNN(15, input_shape=x_train_multi.shape[-2:]),
    Dense(1)
])
model_rnn.compile(optimizer='adam',loss='mse')

In [57]:
def train_model(model, train_data, val_data=val_data, 
                epochs=100,steps_per_epoch=100,
               validation_steps=50, verbose=1, callbacks= None, model_path='../models/model.h5'):
    """Fit ``model`` and return the Keras training history.

    Parameters
    ----------
    model : compiled Keras model
    train_data : dataset passed to ``model.fit`` as the training input
    val_data : dataset used for validation. The default is the module-level
        ``val_data`` as it existed when this function was defined; re-run this
        cell after rebuilding ``val_data`` or pass it explicitly.
    epochs, steps_per_epoch, validation_steps, verbose :
        Forwarded unchanged to ``model.fit``.
    callbacks : list or None
        Keras callbacks. When None, early stopping on ``val_loss``
        (patience 5) and a best-only checkpoint at ``model_path`` are used.
    model_path : str
        Checkpoint location used only by the default callbacks.

    Returns
    -------
    The history object returned by ``model.fit``.
    """
    # Identity check: `== None` would invoke __eq__ on whatever was passed in.
    if callbacks is None:
        callbacks = [
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5,
                                             verbose=1, mode='min'),
            tf.keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss',
                                               save_best_only=True, mode='min', verbose=0),
        ]
    history = model.fit(train_data, epochs=epochs,
                        steps_per_epoch=steps_per_epoch, validation_data=val_data,
                        validation_steps=validation_steps, verbose=verbose,
                        callbacks=callbacks)
    return history

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping


In [59]:
def lifecycle(model, data, split, train_data, x_test_multi, y_test_multi, y_scaler, y_val_multi):
    """Train ``model``, predict on the test windows, then report and plot the results.

    Steps: fit via ``train_model`` (default settings), predict ``x_test_multi``,
    map targets and predictions back to original units with ``process``, print
    the evaluation metrics, and show the loss curve and the
    actual-vs-predicted chart. Nothing is returned.
    """
    fit_history = train_model(model, train_data)
    scaled_predictions = model.predict(x_test_multi)
    val_frame, actual_frame, predicted_frame = process(
        y_scaler, y_val_multi, y_test_multi, scaled_predictions, data, split
    )
    timeseries_evaluation_metrics_func(actual_frame.values, predicted_frame.values)
    show_model_loss_plot(fit_history)
    show_actual_vs_predicted_plot(val_frame, actual_frame, predicted_frame)