# import libraries for data manipulation

In [1]:
#import keras
#from keras.layers import Dense
#from keras.models import Sequential
#from  keras.optimizers import Adam
#from  keras.callbacks import EarlyStopping
#from  keras.utils import np_utils
#from keras.layers import LTSM
#from sklearn.model_selection import KFold, cross_val_score, train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#to display the totalnumber of columns in the dataset
pd.set_option('display.max_columns', None)

# import the dataset

In [2]:
df_sales = pd.read_csv('./dataset/train.csv')

df_sales.head(10)

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
5,2013-01-06,1,1,12
6,2013-01-07,1,1,10
7,2013-01-08,1,1,9
8,2013-01-09,1,1,12
9,2013-01-10,1,1,9


# Our task is to forecast monthly total sales. We need to aggregate our data at the monthly level and sum up the sales column.

In [3]:
#represent month in date field as its first day
df_sales['date'] = pd.to_datetime(df_sales['date'])
df_sales['date'] = df_sales['date'].dt.year.astype('str') + '-' + df_sales['date'].dt.month.astype('str') + '-01'
#df_sales['date'] = pd.to_datetime(df_sales['date'])
#groupby date and sum the sales
df_sales = df_sales.groupby('date').sales.sum().reset_index()


# After applying the code above, df_sales is now showing the aggregated sales we need:

In [4]:
df_sales.head(5)

Unnamed: 0,date,sales
0,2013-1-01,454904
1,2013-10-01,656587
2,2013-11-01,692643
3,2013-12-01,506607
4,2013-2-01,459417


# Data Transformation
# To model our forecast easier and more accurate, we will do the transformations below:

- We will convert the data to stationary if it is not
- Converting from time series to supervised for having the feature set of our LSTM model
- Scale the data

In [None]:
#plot monthly sales
plot_data = [
    go.Scatter(
        x=df_sales['date'],
        y=df_sales['sales'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

NameError: name 'go' is not defined

# get the difference in sales compared to the previous month and build the model on it

In [6]:
#create a new dataframe to model the difference
df_diff = df_sales.copy()
#add previous sales to the next row
df_diff['prev_sales'] = df_diff['sales'].shift(1)
#drop the null values and calculate the difference
df_diff = df_diff.dropna()
df_diff['diff'] = (df_diff['sales'] - df_diff['prev_sales'])
#df_diff.head(10)

In [7]:
df_diff.head(10)

Unnamed: 0,date,sales,prev_sales,diff
1,2013-10-01,656587,454904.0,201683.0
2,2013-11-01,692643,656587.0,36056.0
3,2013-12-01,506607,692643.0,-186036.0
4,2013-2-01,459417,506607.0,-47190.0
5,2013-3-01,617382,459417.0,157965.0
6,2013-4-01,682274,617382.0,64892.0
7,2013-5-01,763242,682274.0,80968.0
8,2013-6-01,795597,763242.0,32355.0
9,2013-7-01,855922,795597.0,60325.0
10,2013-8-01,766761,855922.0,-89161.0


# Let’s plot it and check if it is stationary now:


In [8]:
#plot sales diff
plot_data = [
    go.Scatter(
        x=df_diff['date'],
        y=df_diff['diff'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales Diff'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

NameError: name 'go' is not defined

# Perfect! Now we can start building our feature set. 
- We need to use previous monthly sales data to forecast the next ones. 
- The look-back period may vary for every model. Ours will be 12 for this example.
- So what we need to do is to create columns from lag_1 to lag_12 and assign values by using shift() method:

In [9]:
#create dataframe for transformation from time series to supervised
df_supervised = df_diff.drop(['prev_sales'],axis=1)
#adding lags
for inc in range(1,13):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)

# Check out our new dataframe called df_supervised

In [10]:
df_supervised.head(10)

Unnamed: 0,date,sales,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
0,2014-10-01,758883,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0,-47190.0,-186036.0,36056.0,201683.0
1,2014-11-01,800783,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0,-47190.0,-186036.0,36056.0
2,2014-12-01,578048,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0,-47190.0,-186036.0
3,2014-2-01,529117,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0,-47190.0
4,2014-3-01,704301,175184.0,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0,157965.0
5,2014-4-01,788914,84613.0,175184.0,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0,64892.0
6,2014-5-01,882877,93963.0,84613.0,175184.0,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0,80968.0
7,2014-6-01,906842,23965.0,93963.0,84613.0,175184.0,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0,32355.0
8,2014-7-01,989010,82168.0,23965.0,93963.0,84613.0,175184.0,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0,60325.0
9,2014-8-01,885596,-103414.0,82168.0,23965.0,93963.0,84613.0,175184.0,-48931.0,-222735.0,41900.0,232896.0,-163920.0,-76854.0,-89161.0


# We have our feature set now. Let’s be a bit more curious and ask this question:

# How useful are our features for prediction?

- Adjusted R-squared is the answer. 
- It tells us how good our features explain the variation in our label (lag_1 to lag_12 for diff, in our example).

Let’s see it in an example:
- using only one feature set

In [11]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf
# Define the regression formula
model = smf.ols(formula='diff ~ lag_1', data=df_supervised)
# Fit the regression
model_fit = model.fit()
# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

-0.021681093175723642


# How is the score if we use the entire feature set?

In [12]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf
# Define the regression formula
model = smf.ols(formula='diff ~ lag_12', data=df_supervised)
# Fit the regression
model_fit = model.fit()
# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.9860041281425063


# We should split our data into train and test sets. As the test set, we have selected the last 6 months’ sales.


In [13]:
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df_supervised.drop(['sales','date'],axis=1)
#split train and test set
train_set, test_set = df_model[0:-6].values, df_model[-6:].values

# As the scaler, we are going to use MinMaxScaler, which will scale each future between -1 and 1:

In [14]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)
# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

# Building the LSTM model
- Everything is ready to build our first deep learning model. Let’s create feature and label sets from scaled datasets:

In [15]:
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# Let’s fit our LSTM model:
- The code block below prints how the model improves itself and reduce the error in each epoch:

In [16]:
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, nb_epoch=100, batch_size=1, verbose=1, shuffle=False)

NameError: name 'Sequential' is not defined

# Let’s do the prediction and see how the results look like:

In [17]:
y_pred = model.predict(X_test,batch_size=1)
#for multistep prediction, you need to replace X_test values with the predictions coming from t-1

TypeError: predict() got an unexpected keyword argument 'batch_size'

# Results look similar but it doesn’t tell us much because these are scaled data that shows the difference. 
- How we can see the actual sales prediction?
- First, we need to do the inverse transformation for scaling:

In [18]:
#reshape y_pred
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])
#rebuild test set for inverse transform
pred_test_set = []
for index in range(0,len(y_pred)):
    print (np.concatenate([y_pred[index],X_test[index]],axis=1))
    pred_test_set.append(np.concatenate([y_pred[index],X_test[index]],axis=1))
#reshape pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
#inverse transform
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

NameError: name 'y_pred' is not defined

# Second, we need to build the dataframe has the dates and the predictions. 
- Transformed predictions are showing the difference. 
- We should calculate the predicted sales numbers:

In [19]:
#create dataframe that shows the predicted sales
result_list = []
sales_dates = list(df_sales[-7:].date)
act_sales = list(df_sales[-7:].sales)
for index in range(0,len(pred_test_set_inverted)):
    result_dict = {}
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_sales[index])
    result_dict['date'] = sales_dates[index+1]
    result_list.append(result_dict)
df_result = pd.DataFrame(result_list)
#for multistep prediction, replace act_sales with the predicted sales

NameError: name 'pred_test_set_inverted' is not defined

# Great! We’ve predicted the next six months’ sales numbers. Let’s check them in the plot to see how good is our model:

In [20]:
#merge with actual sales dataframe
df_sales_pred = pd.merge(df_sales,df_result,on='date',how='left')
#plot actual and predicted
plot_data = [
    go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['sales'],
        name='actual'
    ),
        go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['pred_value'],
        name='predicted'
    )
    
]
plot_layout = go.Layout(
        title='Sales Prediction'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

NameError: name 'df_result' is not defined

# One improvement we can do for this model is to add holidays, breaks, and other seasonal effects. 
- They can be simply added as a new feature.
- By using this model, we have our baseline sales predictions. But how we can predict the effect of a promotion on sales? 