In [None]:
# Importing the libraries and removing the warning messages

import pandas as pd
import numpy as np
import warnings
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib import ticker
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from statsmodels.tsa.stattools import adfuller, acf, pacf
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Suppress every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Importing the scripts

sys.path.append("..")
from scripts.data_vizualization import Data_Viz 
from scripts.data_cleaning import DataCleaner
from scripts.data_transformation import DataTransformer
from scripts.stationary_check import check_stationary_mv, check_stationary_adf,corrPlots
from scripts.deep_learning import windowed_dataset,model_forecast

# Create one shared instance of each project helper class imported above.
# DV (Data_Viz) is the one used by the summary and line-plot cells below.
DC = DataCleaner()
DV = Data_Viz()
DT = DataTransformer()

In [None]:
# Load the cleaned training data (path is relative to the notebook's folder)
# and print its column dtypes and non-null counts.

train = pd.read_csv('../data/train_data_clean.csv')
train.info()

In [None]:
# Convert the 'Date' column to datetime and put the rows in chronological order.

train = (
    train
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
)
train.head()

In [None]:
# Check whether the training data contains missing values.
# DV.summ_columns is a project helper (scripts.data_vizualization); presumably it
# reports a per-column summary including missing-value counts — TODO confirm.

DV.summ_columns(train)

###### From the above we can see that the train dataset has no missing values.

In [None]:
# Build a one-row-per-date frame: index is Date, the single column 'Sales'
# holds the mean of Sales over all rows sharing that date.

sales_per_day = train.groupby('Date')[['Sales']].mean()

sales_per_day.head()

In [None]:
# Plot the daily mean sales as a line chart.
# Arguments are passed positionally to the project helper DV.plot_line; presumably
# (data, x column, y column, figure size, title, output file name) — TODO confirm.

DV.plot_line(sales_per_day, "Date", "Sales", [15, 8], "Sales per day", "sales_per_day.jpg")

##### From the above graph we can deduce that the data is more or less stationary, with a roughly constant mean and standard deviation. In the next section, let's check its stationarity more formally using the ADF test.

In [None]:
# Check stationarity in two ways: via the mean/std helper and via the ADF helper.
# Both functions come from scripts.stationary_check; the names suggest a
# mean/variance comparison and an Augmented Dickey-Fuller test — TODO confirm.
check_stationary_mv(sales_per_day, "sales per days")
# Visual separator between the two reports.
print('\n************')
check_stationary_adf(sales_per_day,"Sales", "sales per days")

#### From the above metrics we can see that the p-value is < 0.05 and that the mean and standard deviation are constant, so we can say that the data is stationary.

In [None]:
# Plot the autocorrelation (ACF) and partial autocorrelation (PACF) of the daily mean sales.
# Lag counts are derived from the series length instead of being hard-coded
# (the original used nlags=941, presumably len - 1 for this dataset), so the
# cell keeps working if the number of days in the data changes.
daily_sales = sales_per_day.Sales.values
n_obs = len(daily_sales)

# ACF over every available lag: n_obs - 1 is the largest lag that exists.
acf_days = acf(daily_sales, fft=True, nlags=n_obs - 1)
acfNp = np.array(acf_days)

# PACF: keep the original 200 lags, but never reach half the sample size,
# which recent statsmodels versions reject for PACF.
pacf_days = pacf(daily_sales, nlags=min(200, n_obs // 2 - 1))
pacfNp = np.array(pacf_days)

# The second argument is forwarded unchanged to the project plotting helper.
corrPlots(acfNp, 'Simple')
corrPlots(pacfNp, "Paritial")

##### From the above graphs we can also see that the autocorrelation has a constant pattern, so we can say that the data is stationary.

### Scaling the data

In [None]:
# Min-max scale the daily mean sales for model fitting.
# MinMaxScaler's default feature_range is (0, 1), not (-1, 1); the range is
# spelled out explicitly so the code and its description agree.

scaler = MinMaxScaler(feature_range=(0, 1))
# The scaler works on 2-D input, so the Sales values are reshaped to one column.
sales_column = sales_per_day.Sales.values.reshape(-1, 1)
SalesScaled = scaler.fit_transform(sales_column)
sales_per_day['SalesScaled'] = SalesScaled
sales_per_day.tail(10)

In [None]:
# Summary statistics for every column; the min and max rows of 'SalesScaled'
# show the range produced by the scaler.

sales_per_day.describe()

##### From the above description we can see that the data is successfully scaled. The scaler was used with its default settings, so the values lie between 0 and 1 rather than between -1 and 1.

### Training and Validation set separation

In [None]:
# Settings shared by the remaining cells.
SIZE = sales_per_day.SalesScaled.shape[0]  # number of daily observations
WINDOW_SIZE = 48                           # time steps per input window
EPOCHS = 200                               # epochs passed to model.fit
# Everything except the last 2 * WINDOW_SIZE observations. This one value is
# used both as the train/validation split index and as the default batch size
# of windowed_dataset.
BATCH_SIZE = SIZE - 2 * WINDOW_SIZE

In [None]:
# Chronological hold-out split: everything before index BATCH_SIZE is used for
# training, everything from BATCH_SIZE onwards for validation.
date_parts = np.split(sales_per_day.index.values, [BATCH_SIZE])
sales_parts = np.split(sales_per_day.SalesScaled.values, [BATCH_SIZE])

# Dates become column vectors of shape (n, 1); the scaled sales are cast to float32.
DateTrain, DateValid = (np.reshape(part, (-1, 1)) for part in date_parts)
XTrain, XValid = (part.astype('float32') for part in sales_parts)

print("Shape of the training set date series: ", DateTrain.shape)
print("Shape of the validation set date series: ", DateValid.shape)
print()
print("Shape of the training set logarithm of sales series: ", XTrain.shape)
print("Shape of the validation set logarithm of sales series in a stateless LSTM: ", XValid.shape)

In [None]:
# Seed TensorFlow's global random generator before any model is built.
tf.random.set_seed(1234)
# Add a trailing axis to the training values; the resulting shape is displayed below.
series = tf.expand_dims(XTrain, axis=-1)
series.shape

In [None]:
# Step-by-step walkthrough of the pipeline that windowed_dataset() wraps further down.
# Step 1: create a dataset whose elements are the individual slices of `series`
# along its first axis.

dataset = tf.data.Dataset.from_tensor_slices(series)
dataset

In [None]:
# Step 2: group the elements into sliding windows of WINDOW_SIZE + 1 steps,
# moving one step at a time; windows shorter than that are dropped.
dataset = dataset.window(WINDOW_SIZE + 1, shift=1, drop_remainder=True) 

In [None]:
# Step 3: each window is itself a dataset; batch it with WINDOW_SIZE + 1 elements
# so every window becomes a single tensor.
dataset = dataset.flat_map(lambda window: window.batch(WINDOW_SIZE + 1))

In [None]:
# Step 4: split each window into (inputs, target): all steps but the last are the
# inputs, the last step (kept as a length-1 slice) is the target.
dataset = dataset.map(lambda window: (window[:-1], window[-1:]))

In [None]:
# Step 5: group the (inputs, target) pairs into batches of BATCH_SIZE and prefetch
# one batch ahead.
dataset = dataset.batch(BATCH_SIZE).prefetch(1)

In [None]:
def windowed_dataset(series, window_size=WINDOW_SIZE, batch_size=BATCH_SIZE):
    """Build a batched tf.data pipeline of sliding (inputs, target) windows.

    A trailing axis is added to ``series``; the values are then cut into
    overlapping windows of ``window_size + 1`` consecutive steps (stride 1,
    incomplete windows dropped). In each window the first ``window_size``
    steps are the inputs and the final step, kept as a length-1 slice, is
    the target.

    Args:
        series: Values to window; passed to ``tf.expand_dims``.
        window_size: Number of input steps per sample.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, target)`` batches, with one
        batch prefetched.

    NOTE(review): this definition rebinds the ``windowed_dataset`` name that
    was imported from ``scripts.deep_learning`` earlier in the notebook, and
    its defaults are bound to the values WINDOW_SIZE / BATCH_SIZE have when
    this cell runs.
    """
    span = window_size + 1  # inputs plus the one target step

    def split_inputs_and_target(chunk):
        return chunk[:-1], chunk[-1:]

    return (
        tf.data.Dataset.from_tensor_slices(tf.expand_dims(series, axis=-1))
        .window(span, shift=1, drop_remainder=True)
        .flat_map(lambda chunk: chunk.batch(span))
        .map(split_inputs_and_target)
        .batch(batch_size)
        .prefetch(1)
    )

In [None]:
# Build the windowed training and validation datasets.
# Both calls rely on the function's default window_size and batch_size.
# XValid holds the 2 * WINDOW_SIZE observations after the split index, so with
# windows of WINDOW_SIZE + 1 steps it yields WINDOW_SIZE complete windows.

DatasetTrain = windowed_dataset(XTrain)
DatasetVal = windowed_dataset(XValid)

In [None]:
# Two stacked LSTM layers (8 units returning the full sequence, then 4 units)
# followed by a single-unit dense output, trained with Huber loss and Adam.
# NOTE(review): input_shape on the second LSTM is presumably not needed once the
# first layer defines the input — confirm before removing.

model = Sequential([
    LSTM(8, input_shape=[None, 1], return_sequences=True),
    LSTM(4, input_shape=[None, 1]),
    Dense(1),
])
model.compile(optimizer='adam', loss="huber_loss")

In [None]:
# Print the layer-by-layer summary of the compiled model.

model.summary()

In [None]:
# Train for EPOCHS epochs on the windowed training set, evaluating on the
# validation windows after each epoch. The returned History object is kept for
# the loss plot below. The per-epoch log is very long, so the output was removed
# and a snippet of it is shown in the next cell.

history = model.fit(DatasetTrain, epochs=EPOCHS, validation_data=DatasetVal, verbose=1)

In [None]:
# Show a saved image with a snippet of the training output from the cell above.
# NOTE(review): the import below sits mid-notebook, while the notebook's other
# imports live in the first cell.

%matplotlib inline
from IPython.display import Image
Image('../charts/Capture.PNG')

In [None]:
# Draw the training and validation loss per epoch, save the figure to the
# charts folder, then display it.

fig = plt.figure()
for curve in ("loss", "val_loss"):
    # The legend label is the same string as the History key.
    plt.plot(history.history[curve], label=curve)
plt.legend()
plt.savefig('../charts/deep_learning_model_loss.jpg')
plt.show()

### Finding out the accuracy of the model by using the validation set

In [None]:
# Run the project helper model_forecast over the whole scaled series.
# NOTE(review): the slice on the next line assumes model_forecast returns one
# prediction per sliding window, starting at the first complete window — confirm
# against scripts.deep_learning.
Forecast = model_forecast(model, sales_per_day.SalesScaled.values[:, np.newaxis], WINDOW_SIZE, SIZE)
# Keep the part of the forecast used for the validation comparison (still in scaled units).
Results = Forecast[BATCH_SIZE-WINDOW_SIZE:-1]
# Undo the scaling for both the forecast and the validation values.
Results1 = scaler.inverse_transform(Results.reshape(-1,1))
XValid1 = scaler.inverse_transform(XValid.reshape(-1,1))

In [None]:
# Compare the forecast with the validation data, both after the inverse transform.
valid_dates = DateValid.astype('datetime64')

plt.figure(figsize=(30, 8))
plt.title("LSTM Model Forecast Compared to Validation Data")
plt.plot(valid_dates, Results1, label='Forecast series')
plt.plot(valid_dates, np.reshape(XValid1, (2*WINDOW_SIZE, 1)), label='Validation series')
plt.xlabel('Date')
plt.ylabel('Thousands of Units')
plt.xticks(valid_dates[:,-1], rotation = 90)
plt.legend(loc="upper right")

# Both arguments of each metric must be on the same scale: the inverse-transformed
# forecast (Results1) is compared with the inverse-transformed validation values
# (XValid1). Comparing against the still-scaled Results would mix two different
# scales and make MAE and RMSE meaningless.
MAE = tf.keras.metrics.mean_absolute_error(XValid1[:,-1], Results1[:,-1]).numpy()
RMSE = np.sqrt(tf.keras.metrics.mean_squared_error(XValid1[:,-1], Results1[:,-1]).numpy())

textstr = "MAE = " + "{:.3f}".format(MAE) + "  RMSE = " + "{:.3f}".format(RMSE)

# Place the metrics text near the lower-right corner (axes-fraction coordinates).
plt.annotate(textstr, xy=(0.87, 0.05), xycoords='axes fraction')
plt.grid(True)

plt.show()
print(textstr)

In [None]:
# Standardise the daily mean sales and store the result in 'DataScaled'.
# Only the 'Sales' column is passed to the scaler: by this point sales_per_day
# also contains 'SalesScaled', and fitting on the whole frame returns one output
# column per input column, which cannot be stored in a single column.
# NOTE: `scaler` is rebound here and no longer refers to the MinMaxScaler used
# for the inverse transforms in the earlier cells.
scaler = StandardScaler()
scaled_array = scaler.fit_transform(sales_per_day[['Sales']])
# fit_transform returns a 2-D (n, 1) array; flatten it for the column assignment.
sales_per_day['DataScaled'] = scaled_array.ravel()
BATCH_SIZE = len(sales_per_day) - (WINDOW_SIZE * 2)