# Guides for Brides

## 1. Prepare Problem and Explore Data

### Environment Setup

In [None]:
# Keras and Tensorflow installation

!pip install --ignore-installed --upgrade tensorflow==1.6.0
!pip install keras==2.1.5

In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import random
from datetime import datetime
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
from math import sqrt

# Import for visualisation

import matplotlib.pyplot as plt
from plotly import tools
import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
pyoff.init_notebook_mode()
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Do not show warnings

import warnings
warnings.filterwarnings("ignore")

# Import Keras

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras import optimizers
from keras.optimizers import Adam 
from keras.layers import LSTM, GRU, SimpleRNN
from keras.preprocessing.sequence import TimeseriesGenerator

print ('Import completed')

In [None]:
# Load Datasets

# Users export; DataFrame.shape is (rows, columns), unpacked straight into the message
df_users = pd.read_csv('irinie_users_22-06-2020.csv')
print('There are {} rows, {} columns in the users dataset.'.format(*df_users.shape))

# First budgets export
df_budgets = pd.read_csv('irinie_budgets_22-06-2020.csv')
print('There are {} rows, {} columns in the first budgets dataset.'.format(*df_budgets.shape))

# Second budgets export
df_budgets2 = pd.read_csv('irinie_budgets.csv')
print('There are {} rows, {} columns in the second budgets dataset.'.format(*df_budgets2.shape))

# Total wedding budget per user
df_tot_budget = pd.read_csv('irinie_budget_totals_22-06-2020.csv')
print('There are {} rows, {} columns in the total wedding budget dataset.'.format(*df_tot_budget.shape))

### Data Inspection

In [None]:
df_users.columns

The identifier 'id' in the users table matches 'qianqian_user_id' in the other tables. All of these ID columns will be renamed to 'user_id' for simplicity and consistency.

In [None]:
df_users.rename(columns={'id':'user_id'}, inplace=True)

In [None]:
df_budgets.columns

In [None]:
df_budgets2.columns

In [None]:
df_budgets.rename(columns={'qianqian_user_id':'user_id'}, inplace=True)

In [None]:
df_budgets2.rename(columns={'qianqian_user_id':'user_id'}, inplace=True)

In [None]:
df_tot_budget.columns

In [None]:
df_tot_budget.rename(columns={'qianqian_user_id':'user_id'}, inplace=True)

The meaning of 'budget' differs between the 'df_tot_budget' and 'df_budgets' tables, so it will be renamed to 'total_budget' in the 'df_tot_budget' table to distinguish between the two.

In [None]:
df_tot_budget.rename(columns = {'budget':'total_budget'}, inplace = True)

### Data Preprocessing

#### User Table

In [None]:
df_users = df_users.loc[:,['user_id', 'source', 'engagement_date', 'wedding_year', 
                           'wedding_month', 'wedding_day','created_at']]
df_users.head()

In [None]:
#Visualise the completeness of dataframe 
sns.heatmap(df_users.isnull(), cbar=False, yticklabels=False)

In [None]:
print(f'Missing values in each variable: \n{df_users.isnull().sum()}')

#### Budgets Table

We have two budgets tables that cover different periods of time. We therefore append one to the other to obtain a single budgets table.

In [None]:
# Convert date field from string to datetime

df_budgets2['created_at'] =  pd.to_datetime(df_budgets2['created_at'])

In [None]:
df_budgets2 = df_budgets2[(df_budgets2.created_at < datetime(2020,3,1)) 
                             & (df_budgets2.created_at >= datetime(2013,9,1))]

In [None]:
df_budgets2.created_at.max()

In [None]:
df_budgets['created_at'] =  pd.to_datetime(df_budgets['created_at'])

In [None]:
df_budgets.created_at.max()

In [None]:
# Append budgets tables

# pd.concat is used instead of DataFrame.append, which is deprecated and removed in pandas 2.0;
# ignore_index gives the combined frame a fresh 0..n-1 index
df_budgets = pd.concat([df_budgets, df_budgets2], ignore_index = True)

In [None]:
df_budgets = df_budgets.loc[:,['planner_supplier_id', 'user_id', 'business_category',
       'budget', 'quote', 'deposit', 'booked', 'deposit_paid', 'balance_paid','created_at','updated_at']]
df_budgets.head()

In [None]:
print('There are {} rows, {} columns in the complete budgets dataset.'.format(len(df_budgets),
                                                             len(df_budgets.columns)))

Variables 'booked', 'deposit_paid' and 'balance_paid' will be converted to numerical dummy variables (1 for 'yes', 0 otherwise) to simplify the data analysis.

In [None]:
# 1 where the value is 'yes', 0 for anything else
df_budgets['booked'] = np.where(df_budgets['booked'] == 'yes', 1, 0)

In [None]:
# 1 where the value is 'yes', 0 for anything else
df_budgets['deposit_paid'] = np.where(df_budgets['deposit_paid'] == 'yes', 1, 0)

In [None]:
# 1 where the value is 'yes', 0 for anything else
df_budgets['balance_paid'] = np.where(df_budgets['balance_paid'] == 'yes', 1, 0)

In [None]:
df_budgets.head()

In [None]:
df_budgets = df_budgets[df_budgets['budget'] <= 32000]  

In [None]:
len(df_budgets)

In [None]:
df_budgets.booked.value_counts()

In [None]:
df_budgets.deposit_paid.value_counts()

In [None]:
df_budgets.balance_paid.value_counts()

In [None]:
#Visualise the completeness of dataframe 
sns.heatmap(df_budgets.isnull(), cbar=False, yticklabels=False);

In [None]:
print(f'Missing values in each variable: \n{df_budgets.isnull().sum()}')

## 2. Visualise Data

In [None]:
def statistics(variable):
    """Summarise a pandas Series.

    Parameters
    ----------
    variable : pandas.Series
        Column to describe.

    Returns
    -------
    pandas.DataFrame
        For int64/float64 input: a single row, indexed by the series name
        (index name 'Variable'), with the columns 'Mean', 'Standard Deviation',
        'Median' and 'Variance'. Population statistics (ddof=0) are reported and
        missing values are ignored.
        For any other dtype: the value counts of the series.
    """
    if variable.dtype == 'int64' or variable.dtype == 'float64':
        # Every statistic uses the NaN-aware pandas reduction. np.median returns
        # NaN as soon as one value is missing, whereas the mean/std/var already
        # skipped missing values, so the row used to mix the two behaviours.
        summary = {
            'Mean': variable.mean(),
            'Standard Deviation': variable.std(ddof=0),
            'Median': variable.median(),
            'Variance': variable.var(ddof=0),
        }
        return pd.DataFrame([summary],
                            index = pd.Index([variable.name], name = 'Variable'),
                            columns = ['Mean', 'Standard Deviation', 'Median', 'Variance'])
    else:
        return pd.DataFrame(variable.value_counts())

In [None]:
fig = px.histogram(df_users, x = 'wedding_month', nbins = 20)
fig.update_layout(title = 'Wedding Month Distribution')
fig.show()

In [None]:
wedding_month = df_users['wedding_month']
statistics(wedding_month)

In [None]:
fig = px.histogram(df_users, x = 'wedding_day', nbins = 20)
fig.update_layout(title = 'Wedding Day Distribution')
fig.show()

In [None]:
wedding_day = df_users['wedding_day']
statistics(wedding_day)

In [None]:
source_category = round(df_users['source'].value_counts(normalize = True) * 100, 3)
source_category = pd.DataFrame({'source':source_category.index, 'percentage':source_category.values})
source_category.head()

In [None]:
fig = px.bar(source_category, x = 'source', y = 'percentage')
fig.update_layout(title = 'User\'s First Interaction')
fig.show()

In [None]:
booked = round(df_budgets['booked'].value_counts(normalize = True) * 100,)
booked = pd.DataFrame({'booked':booked.index, 'percentage':booked.values})
booked.head()

In [None]:
fig = px.bar(booked, x = 'booked', y = 'percentage')
fig.update_layout(title = 'Booking Distribution')
fig.show()

In [None]:
supplier_category = round(df_budgets['business_category'].value_counts(normalize = True) * 100, 3)
supplier_category = pd.DataFrame({'business_category':supplier_category.index, 'percentage':supplier_category.values})
supplier_category.head()

In [None]:
fig = px.bar(supplier_category, x = 'business_category', y = 'percentage')
fig.update_layout(title = 'Supplier Usage')
fig.show()

In [None]:
# Budget of every budget line, grouped by supplier category
# (the unused px.data.iris() sample frame that used to be loaded here was removed)
fig = px.scatter(df_budgets, x = 'business_category', y = 'budget')
fig.update_layout(title = 'User Budgets per Category')
fig.show()

## 3. Predict Bookings

### Data Preparation

In [None]:
# Retrieve user bookings
df_budgets = df_budgets.loc[df_budgets['booked'] == 1]
df_budgets.head()

In [None]:
len(df_budgets)

In [None]:
df_budgets.rename(columns = {'created_at':'date'}, inplace = True)

In [None]:
# Collapse every timestamp to the first day of its month
year_part = df_budgets['date'].dt.year.astype('str')
month_part = df_budgets['date'].dt.month.astype('str')
df_budgets['date'] = pd.to_datetime(year_part + '-' + month_part + '-01')

# One row per month holding the number of bookings in that month
df_budgets = df_budgets.groupby('date')['booked'].sum().reset_index()

In [None]:
df_budgets.head()

In [None]:
#plot monthly sales
plot_data = [
    go.Scatter(
        x = df_budgets['date'],
        y = df_budgets['booked'],
    )
]
plot_layout = go.Layout(
        title = 'User Bookings Overtime'
    )
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

In [None]:
# Pair every month with the previous month's bookings, then drop rows with
# missing values (the first month has no predecessor)
df_diff = df_budgets.assign(prev_sales = df_budgets['booked'].shift(1)).dropna()
# Modelling target: change in bookings versus the previous month
df_diff['diff'] = df_diff['booked'] - df_diff['prev_sales']
df_diff.head(10)

In [None]:
#plot sales diff
plot_data = [
    go.Scatter(
        x = df_diff['date'],
        y = df_diff['diff'],
    )
]
plot_layout = go.Layout(
        title = 'Monthly Bookings Diff'
    )
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

In [None]:
# Turn the difference series into a supervised-learning table

df_supervised = df_diff.drop('prev_sales', axis = 1)

# lag_k holds the value of 'diff' from k rows (months) earlier, for k = 1..27

for lag in range(1, 28):
    df_supervised['lag_' + str(lag)] = df_supervised['diff'].shift(lag)

# The first rows lack a full lag history; discard them and renumber the index

df_supervised = df_supervised.dropna().reset_index(drop = True)

In [None]:
df_supervised

In [None]:
# Import statsmodels.formula.api

import statsmodels.formula.api as smf

# Regress 'diff' on its first 23 lags; the right-hand side is assembled as
# 'lag_1 + lag_2 + ... + lag_23'

lag_terms = ' + '.join('lag_' + str(k) for k in range(1, 24))
model = smf.ols(formula = 'diff ~ ' + lag_terms, data = df_supervised)
model_fit = model.fit()

# Adjusted R-squared of the fitted regression

regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

In [None]:
regression_rsq = model_fit.rsquared
print(regression_rsq)

In [None]:
# Build the modelling frame: drop the raw 'booked' count and index by date,
# leaving 'diff' as the first column followed by the lag features

df_model = df_supervised.drop(['booked'],axis = 1)
df_model = df_model.set_index('date')

In [None]:
# Split train and test set

train_set, test_set = df_model[0:-6].values, df_model[-6:].values

In [None]:
# Apply Min Max Scaler

scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)

In [None]:
# Scale the training set with the scaler fitted on it
# (the reshape keeps the array's existing 2-D (rows, columns) shape)

train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)

# Scale the test set with the same training-fitted scaler

test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

In [None]:
# Column 0 of the scaled arrays is the target ('diff'); the remaining columns are the lag features
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
# Insert a length-1 middle axis so the inputs are (samples, 1, features)
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
# Same target/feature split and reshape for the test rows
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

#### Simple RNN (SGD)

In [None]:
# Build Model

np.random.seed(1337)
print('Building model...')
model = Sequential()
# Stateful recurrent layer: batch_input_shape fixes the batch size to 1
model.add(SimpleRNN(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful = True))
model.add(Dropout(0.2))
model.add(Dense(1))
# Linear output: the training target is min-max scaled to [-1, 1], and a ReLU
# output could never produce the negative half of that range
model.add(Activation('linear'))
model.compile(loss = 'mean_squared_error', optimizer = 'SGD', metrics = ['mse'])

In [None]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 1, validation_data = (X_test, y_test), verbose = 0, shuffle = False)

In [None]:
# Plot history: MSE

plt.plot(history.history['mean_squared_error'], label='MSE (training data)')
plt.plot(history.history['val_mean_squared_error'], label='MSE (validation data)')
plt.ylabel('MSE value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()

In [None]:
test_pred = model.predict(X_test,batch_size=1)
train_pred = model.predict(X_train,batch_size=1)

In [None]:
# Seeing the booking prediction
# Reshape y_pred and train_pred

test_pred_inv = test_pred .reshape(test_pred .shape[0], 1, test_pred .shape[1])
train_pred_inv = train_pred.reshape(train_pred.shape[0], 1, train_pred.shape[1])

In [None]:
# Rebuild test set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_test_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(test_pred_inv, X_test)
]

In [None]:
# Rebuild train set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_train_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(train_pred_inv, X_train)
]

In [None]:
# Reshape pred_test_set and pred_train_set

pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
pred_train_set = np.array(pred_train_set)
pred_train_set = pred_train_set.reshape(pred_train_set.shape[0], pred_train_set.shape[2])

In [None]:
#Inverse transform

pred_test_set_inverted = scaler.inverse_transform(pred_test_set)
pred_train_set_inverted = scaler.inverse_transform(pred_train_set)

In [None]:
# Monthly rows that have a full lag history, i.e. the rows present in df_model.
# The start offset is derived from the frame lengths instead of the hard-coded
# 28:53, which is only correct when df_budgets has exactly 53 months.
first_modelled = len(df_budgets) - len(df_model)
df_budgets_new = df_budgets[first_modelled:].reset_index(drop=True)

In [None]:
# Create dataframe that shows the predicted sales (test)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates = list(df_budgets_new[-6:].date)
act_bookings = list(prev_booked.loc[booking_dates])
result_list = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_test_set_inverted, act_bookings, booking_dates)
]
df_result = pd.DataFrame(result_list)

In [None]:
# Monthly rows that correspond to the training samples. The bounds are derived
# from the frame lengths instead of the hard-coded 28:47, which is only correct
# when df_budgets has exactly 53 months.
first_modelled = len(df_budgets) - len(df_model)
train_list = df_budgets[first_modelled:first_modelled + len(X_train)].reset_index(drop=True)

In [None]:
# Create dataframe that shows the predicted sales (train)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates2 = list(train_list.date)
act_bookings2 = list(prev_booked.loc[booking_dates2])
result_list2 = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_train_set_inverted, act_bookings2, booking_dates2)
]
df_result2 = pd.DataFrame(result_list2)

In [None]:
# Merge with actual sales dataframe

df_budgets_test = pd.merge(df_budgets_new,df_result,on = 'date',how = 'left')
df_budgets_train = pd.merge(df_budgets_new,df_result2,on = 'date',how = 'left')

In [None]:
#Plot actual and predicted

plot_data = [
    go.Scatter(
        x = df_budgets_new['date'],
        y = df_budgets_new['booked'],
        name='Actual'
    ),
        go.Scatter(
        x = df_budgets_test['date'],
        y = df_budgets_test['pred_value'],
        name = 'Test Prediction'
    ),
        go.Scatter(
        x = df_budgets_train['date'],
        y = df_budgets_train['pred_value'],
        name = 'Train Prediction'
    )
    
]

plot_layout = go.Layout(
        title = 'Booking Prediction Using SimpleRNN (SGD)'
    )
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

In [None]:
# Calculate RMSE and MAE

def return_rmse(test,predicted):
    """Return the root mean squared error between `test` and `predicted`.

    Both arguments are passed unchanged to sklearn's `mean_squared_error`.
    The value is now returned; previously it was computed and discarded.
    """
    return math.sqrt(mean_squared_error(test, predicted))
    
def return_mae(test,predicted):
    """Return the mean absolute error between `test` and `predicted`.

    Both arguments are passed unchanged to sklearn's `mean_absolute_error`.
    No square root is applied: the MAE is already in the units of the target.
    The value is now returned; previously it was computed and discarded.
    """
    return mean_absolute_error(test, predicted)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_test, test_pred))
print('Test RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_test, test_pred)
print('Test MAE: %.3f' % mae)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_train, train_pred))
print('Train RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_train, train_pred)
print('Train MAE: %.3f' % mae)

#### Simple RNN (Adam)

In [None]:
# Build Model

np.random.seed(1337)
print('Building model...')
model = Sequential()
# Stateful recurrent layer: batch_input_shape fixes the batch size to 1
model.add(SimpleRNN(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful = True))
model.add(Dropout(0.2))
model.add(Dense(1))
# Linear output: the training target is min-max scaled to [-1, 1], and a ReLU
# output could never produce the negative half of that range
model.add(Activation('linear'))
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mse'])

In [None]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 1, validation_data = (X_test, y_test), verbose = 0, shuffle = False)

In [None]:
# Plot history: MSE

plt.plot(history.history['mean_squared_error'], label='MSE (training data)')
plt.plot(history.history['val_mean_squared_error'], label='MSE (validation data)')
plt.ylabel('MSE value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()

In [None]:
test_pred = model.predict(X_test,batch_size=1)
train_pred = model.predict(X_train,batch_size=1)

In [None]:
# Seeing the booking prediction
# Reshape y_pred and train_pred

test_pred_inv = test_pred .reshape(test_pred .shape[0], 1, test_pred .shape[1])
train_pred_inv = train_pred.reshape(train_pred.shape[0], 1, train_pred.shape[1])

In [None]:
# Rebuild test set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_test_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(test_pred_inv, X_test)
]

In [None]:
# Rebuild train set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_train_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(train_pred_inv, X_train)
]

In [None]:
# Reshape pred_test_set and pred_train_set

pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
pred_train_set = np.array(pred_train_set)
pred_train_set = pred_train_set.reshape(pred_train_set.shape[0], pred_train_set.shape[2])

In [None]:
# Inverse transform

pred_test_set_inverted = scaler.inverse_transform(pred_test_set)
pred_train_set_inverted = scaler.inverse_transform(pred_train_set)

In [None]:
# Create dataframe that shows the predicted sales (test)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates = list(df_budgets_new[-6:].date)
act_bookings = list(prev_booked.loc[booking_dates])
result_list = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_test_set_inverted, act_bookings, booking_dates)
]
df_result = pd.DataFrame(result_list)

In [None]:
# Create dataframe that shows the predicted sales (train)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates2 = list(train_list.date)
act_bookings2 = list(prev_booked.loc[booking_dates2])
result_list2 = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_train_set_inverted, act_bookings2, booking_dates2)
]
df_result2 = pd.DataFrame(result_list2)

In [None]:
# Merge with actual sales dataframe

df_budgets_test2 = pd.merge(df_budgets_new,df_result,on = 'date',how = 'left')
df_budgets_train2 = pd.merge(df_budgets_new,df_result2,on = 'date',how = 'left')

In [None]:
# Plot actual and predicted

plot_data = [
    go.Scatter(
        x = df_budgets_new['date'],
        y = df_budgets_new['booked'],
        name='Actual'
    ),
        go.Scatter(
        x = df_budgets_test2['date'],
        y = df_budgets_test2['pred_value'],
        name = 'Test Prediction'
    ),
        go.Scatter(
        x = df_budgets_train2['date'],
        y = df_budgets_train2['pred_value'],
        name = 'Train Prediction'
    )
    
]

plot_layout = go.Layout(
        title = 'Booking Prediction Using Simple RNN (Adam)'
    )
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_test, test_pred))
print('Test RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_test, test_pred)
print('Test MAE: %.3f' % mae)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_train, train_pred))
print('Train RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_train, train_pred)
print('Train MAE: %.3f' % mae)

#### LSTM (SGD)

In [None]:
# Build Model

np.random.seed(1337)
print('Building model...')
model = Sequential()
# Stateful recurrent layer: batch_input_shape fixes the batch size to 1
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dropout(0.2))
model.add(Dense(1))
# Linear output: the training target is min-max scaled to [-1, 1], and a ReLU
# output could never produce the negative half of that range
model.add(Activation('linear'))
model.compile(loss = 'mean_squared_error', optimizer = 'SGD', metrics = ['mse'])

In [None]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 1, validation_data = (X_test, y_test), verbose = 0, shuffle = False)

In [None]:
# Plot history: MSE

plt.plot(history.history['mean_squared_error'], label='MSE (training data)')
plt.plot(history.history['val_mean_squared_error'], label='MSE (validation data)')
plt.ylabel('MSE value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()

In [None]:
test_pred = model.predict(X_test,batch_size=1)
train_pred = model.predict(X_train,batch_size=1)

In [None]:
# Seeing the booking prediction
# Reshape y_pred and train_pred

test_pred_inv = test_pred .reshape(test_pred .shape[0], 1, test_pred .shape[1])
train_pred_inv = train_pred.reshape(train_pred.shape[0], 1, train_pred.shape[1])

In [None]:
# Rebuild test set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_test_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(test_pred_inv, X_test)
]

In [None]:
# Rebuild train set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_train_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(train_pred_inv, X_train)
]

In [None]:
# Reshape pred_test_set and pred_train_set

pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
pred_train_set = np.array(pred_train_set)
pred_train_set = pred_train_set.reshape(pred_train_set.shape[0], pred_train_set.shape[2])

In [None]:
# Inverse transform

pred_test_set_inverted = scaler.inverse_transform(pred_test_set)
pred_train_set_inverted = scaler.inverse_transform(pred_train_set)

In [None]:
# Create dataframe that shows the predicted sales (test)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates = list(df_budgets_new[-6:].date)
act_bookings = list(prev_booked.loc[booking_dates])
result_list = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_test_set_inverted, act_bookings, booking_dates)
]
df_result = pd.DataFrame(result_list)

In [None]:
# Create dataframe that shows the predicted sales (train)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates2 = list(train_list.date)
act_bookings2 = list(prev_booked.loc[booking_dates2])
result_list2 = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_train_set_inverted, act_bookings2, booking_dates2)
]
df_result2 = pd.DataFrame(result_list2)

In [None]:
# Merge with actual sales dataframe

df_budgets_test3 = pd.merge(df_budgets_new,df_result,on = 'date',how = 'left')
df_budgets_train3 = pd.merge(df_budgets_new,df_result2,on = 'date',how = 'left')

In [None]:
# Plot actual and predicted

plot_data = [
    go.Scatter(
        x = df_budgets_new['date'],
        y = df_budgets_new['booked'],
        name='Actual'
    ),
        go.Scatter(
        x = df_budgets_test3['date'],
        y = df_budgets_test3['pred_value'],
        name = 'Test Prediction'
    ),
        go.Scatter(
        x = df_budgets_train3['date'],
        y = df_budgets_train3['pred_value'],
        name = 'Train Prediction'
    )
    
]

plot_layout = go.Layout(
        title = 'Booking Prediction Using LSTM (SGD)'
    )
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_test, test_pred))
print('Test RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_test, test_pred)
print('Test MAE: %.3f' % mae)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_train, train_pred))
print('Train RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_train, train_pred)
print('Train MAE: %.3f' % mae)

#### LSTM (Adam)

In [None]:
# Build Model

np.random.seed(1337)
print('Building model...')
model = Sequential()
# Stateful recurrent layer: batch_input_shape fixes the batch size to 1
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful = True))
model.add(Dropout(0.2))
model.add(Dense(1))
# Linear output: the training target is min-max scaled to [-1, 1], and a ReLU
# output could never produce the negative half of that range
model.add(Activation('linear'))
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mse'])

In [None]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 1, validation_data = (X_test, y_test), verbose = 0, shuffle = False)

In [None]:
# Plot history: MSE

plt.plot(history.history['mean_squared_error'], label='MSE (training data)')
plt.plot(history.history['val_mean_squared_error'], label='MSE (validation data)')
plt.ylabel('MSE value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()

In [None]:
test_pred = model.predict(X_test,batch_size=1)
train_pred = model.predict(X_train,batch_size=1)

In [None]:
# Seeing the booking prediction
# Reshape y_pred and train_pred

test_pred_inv = test_pred .reshape(test_pred .shape[0], 1, test_pred .shape[1])
train_pred_inv = train_pred.reshape(train_pred.shape[0], 1, train_pred.shape[1])

In [None]:
# Rebuild test set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_test_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(test_pred_inv, X_test)
]

In [None]:
# Rebuild train set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_train_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(train_pred_inv, X_train)
]

In [None]:
# Reshape pred_test_set and pred_train_set

pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
pred_train_set = np.array(pred_train_set)
pred_train_set = pred_train_set.reshape(pred_train_set.shape[0], pred_train_set.shape[2])

In [None]:
# Inverse transform

pred_test_set_inverted = scaler.inverse_transform(pred_test_set)
pred_train_set_inverted = scaler.inverse_transform(pred_train_set)

In [None]:
# Create dataframe that shows the predicted sales (test)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates = list(df_budgets_new[-6:].date)
act_bookings = list(prev_booked.loc[booking_dates])
result_list = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_test_set_inverted, act_bookings, booking_dates)
]
df_result = pd.DataFrame(result_list)

In [None]:
# Create dataframe that shows the predicted sales (train)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates2 = list(train_list.date)
act_bookings2 = list(prev_booked.loc[booking_dates2])
result_list2 = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_train_set_inverted, act_bookings2, booking_dates2)
]
df_result2 = pd.DataFrame(result_list2)

In [None]:
# Merge with actual sales dataframe

df_budgets_test4 = pd.merge(df_budgets_new,df_result,on = 'date',how = 'left')
df_budgets_train4 = pd.merge(df_budgets_new,df_result2,on = 'date',how = 'left')

In [None]:
# Plot actual and predicted

plot_data = [
    go.Scatter(
        x = df_budgets_new['date'],
        y = df_budgets_new['booked'],
        name='Actual'
    ),
        go.Scatter(
        x = df_budgets_test4['date'],
        y = df_budgets_test4['pred_value'],
        name = 'Test Prediction'
    ),
        go.Scatter(
        x = df_budgets_train4['date'],
        y = df_budgets_train4['pred_value'],
        name = 'Train Prediction'
    )
    
]

plot_layout = go.Layout(
        title = 'Booking Prediction Using LSTM (Adam)'
    )
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_test, test_pred))
print('Test RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_test, test_pred)
print('Test MAE: %.3f' % mae)

In [None]:
# Errors are measured on the scaled target values
rmse = sqrt(mean_squared_error(y_train, train_pred))
print('Train RMSE: %.3f' % rmse)

# MAE is reported as-is; taking its square root would not be a mean absolute error
mae = mean_absolute_error(y_train, train_pred)
print('Train MAE: %.3f' % mae)

#### GRU (SGD)

In [None]:
np.random.seed(1337)
print('Building model...')
model = Sequential()
# Stateful recurrent layer: batch_input_shape fixes the batch size to 1
model.add(GRU(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful = True))
model.add(Dropout(0.2))
model.add(Dense(1))
# Linear output: the training target is min-max scaled to [-1, 1], and a ReLU
# output could never produce the negative half of that range
model.add(Activation('linear'))
model.compile(loss = 'mean_squared_error', optimizer = 'SGD', metrics = ['mse'])

In [None]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 1, validation_data = (X_test, y_test), verbose = 0, shuffle = False)

In [None]:
# Plot history: MSE

plt.plot(history.history['mean_squared_error'], label='MSE (training data)')
plt.plot(history.history['val_mean_squared_error'], label='MSE (validation data)')
plt.ylabel('MSE value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()

In [None]:
test_pred = model.predict(X_test,batch_size=1)
train_pred = model.predict(X_train,batch_size=1)

In [None]:
# Seeing the booking prediction
# Reshape y_pred and train_pred

test_pred_inv = test_pred .reshape(test_pred .shape[0], 1, test_pred .shape[1])
train_pred_inv = train_pred.reshape(train_pred.shape[0], 1, train_pred.shape[1])

In [None]:
# Rebuild test set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_test_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(test_pred_inv, X_test)
]

In [None]:
# Rebuild train set for inverse transform

# Put each prediction in front of its feature row so the row has the column
# layout the scaler was fitted on. The concatenation is done once per row
# (the previous loop computed it twice and discarded the first result).
pred_train_set = [
    np.concatenate([pred_row, feature_row], axis = 1)
    for pred_row, feature_row in zip(train_pred_inv, X_train)
]

In [None]:
# Reshape pred_test_set and pred_train_set

pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
pred_train_set = np.array(pred_train_set)
pred_train_set = pred_train_set.reshape(pred_train_set.shape[0], pred_train_set.shape[2])

In [None]:
# Inverse transform

pred_test_set_inverted = scaler.inverse_transform(pred_test_set)
pred_train_set_inverted = scaler.inverse_transform(pred_train_set)

In [None]:
# Create dataframe that shows the predicted sales (test)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates = list(df_budgets_new[-6:].date)
act_bookings = list(prev_booked.loc[booking_dates])
result_list = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_test_set_inverted, act_bookings, booking_dates)
]
df_result = pd.DataFrame(result_list)

In [None]:
# Create dataframe that shows the predicted sales (train)

# The model predicts diff = booked[t] - booked[t-1] (see df_diff), so each
# forecast is rebuilt from the PREVIOUS month's actual bookings. Adding the same
# month's actual, as before, leaks the value being predicted into the forecast.
prev_booked = df_budgets.set_index('date')['booked'].shift(1)
booking_dates2 = list(train_list.date)
act_bookings2 = list(prev_booked.loc[booking_dates2])
result_list2 = [
    {'pred_value': int(pred_row[0] + prev_actual), 'date': booking_date}
    for pred_row, prev_actual, booking_date in zip(pred_train_set_inverted, act_bookings2, booking_dates2)
]
df_result2 = pd.DataFrame(result_list2)

In [None]:
# Merge with actual sales dataframe

df_budgets_test5 = pd.merge(df_budgets_new,df_result,on = 'date',how = 'left')
df_budgets_train5 = pd.merge(df_budgets_new,df_result2,on = 'date',how = 'left')

In [None]:
# Plot actual bookings against the test and train predictions.

actual_trace = go.Scatter(
    x=df_budgets_new['date'],
    y=df_budgets_new['booked'],
    name='Actual',
)
test_trace = go.Scatter(
    x=df_budgets_test5['date'],
    y=df_budgets_test5['pred_value'],
    name='Test Prediction',
)
train_trace = go.Scatter(
    x=df_budgets_train5['date'],
    y=df_budgets_train5['pred_value'],
    name='Train Prediction',
)

plot_data = [actual_trace, test_trace, train_trace]
plot_layout = go.Layout(title='Booking Prediction Using GRU (SGD)')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Test-set error metrics, computed directly on y_test and test_pred
# (no inverse transform is applied in this cell).
rmse = sqrt(mean_squared_error(y_test, test_pred))
print('Test RMSE: %.3f' % rmse)

# MAE is already in the units of the target. The original took its square
# root, which reported a value that is not the mean absolute error.
mae = mean_absolute_error(y_test, test_pred)
print('Test MAE: %.3f' % mae)

In [None]:
# Train-set error metrics, computed directly on y_train and train_pred
# (no inverse transform is applied in this cell).
rmse = sqrt(mean_squared_error(y_train, train_pred))
print('Train RMSE: %.3f' % rmse)

# MAE is already in the units of the target. The original took its square
# root, which reported a value that is not the mean absolute error.
mae = mean_absolute_error(y_train, train_pred)
print('Train MAE: %.3f' % mae)

#### GRU (Adam)

In [None]:
# Build a single-layer stateful GRU regressor trained with the Adam optimizer.
# Seed NumPy's global RNG before any layer is created.
# NOTE(review): only NumPy is seeded here; whether that is enough to make the
# backend deterministic is not visible from this cell.
np.random.seed(1337)
print('Building model...')
model = Sequential()
# A stateful layer needs a fixed batch size; it is pinned to 1 here, which is
# why fit/predict below are called with batch_size=1.
model.add(GRU(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful = True))
model.add(Dropout(0.2))
model.add(Dense(1))
# NOTE(review): a ReLU output cannot produce negative values. If the target
# is scaled to a range that includes negatives (the scaler fitted later in
# this notebook uses (-1, 1)), those targets are unreachable - confirm.
model.add(Activation('relu'))
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mse'])

In [None]:
# Train for 10 epochs in chronological order (no shuffling), one sample per
# batch, tracking the test split as validation data.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=1,
    validation_data=(X_test, y_test),
    verbose=0,
    shuffle=False,
)

In [None]:
# Plot history: MSE per epoch for the training and validation data.
# The history keys follow the metric name recorded by the Keras version
# pinned at the top of the notebook.

for history_key, curve_label in (
    ('mean_squared_error', 'MSE (training data)'),
    ('val_mean_squared_error', 'MSE (validation data)'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.ylabel('MSE value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()

In [None]:
# Predict on both splits with the batch size the model was built for (1).
# The model is stateful, so the order of these two calls is kept as is.
test_pred = model.predict(X_test,batch_size=1)
train_pred = model.predict(X_train,batch_size=1)

In [None]:
# Seeing the booking prediction
# Reshape test_pred and train_pred from (n, k) to (n, 1, k) so each row can
# be concatenated with the matching feature row along axis 1.

test_pred_inv = test_pred.reshape((test_pred.shape[0], 1, test_pred.shape[1]))
train_pred_inv = train_pred.reshape((train_pred.shape[0], 1, train_pred.shape[1]))

In [None]:
# Rebuild test set for inverse transform: each row is
# [prediction, feature_1, ..., feature_n].
# The original loop called np.concatenate twice per row and discarded the
# first result; each row is now built once.

pred_test_set = [
    np.concatenate([pred_row, feature_row], axis=1)
    for pred_row, feature_row in zip(test_pred_inv, X_test)
]

In [None]:
# Rebuild train set for inverse transform: each row is
# [prediction, feature_1, ..., feature_n].
# The original loop called np.concatenate twice per row and discarded the
# first result; each row is now built once.

pred_train_set = [
    np.concatenate([pred_row, feature_row], axis=1)
    for pred_row, feature_row in zip(train_pred_inv, X_train)
]

In [None]:
# Stack the per-row arrays and flatten them to 2-D: (n_rows, n_columns).
# The reshape keeps axes 0 and 2, so it only succeeds when the middle axis
# has length 1.

pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape((pred_test_set.shape[0], pred_test_set.shape[2]))

pred_train_set = np.array(pred_train_set)
pred_train_set = pred_train_set.reshape((pred_train_set.shape[0], pred_train_set.shape[2]))

In [None]:
# Undo the scaling on the rebuilt test and train rows.

pred_test_set_inverted, pred_train_set_inverted = (
    scaler.inverse_transform(pred_test_set),
    scaler.inverse_transform(pred_train_set),
)

In [None]:
# Build the test prediction table: one row per test date with the predicted
# bookings (column 0 of the inverse-transformed row plus the actual bookings).
# NOTE(review): the actual value added here comes from the same row as the
# date being predicted (both use the last 6 rows) - confirm this offset is
# intended.

booking_dates = list(df_budgets_new[-6:]['date'])
act_bookings = list(df_budgets_new[-6:]['booked'])
result_list = []
for i, inverted_row in enumerate(pred_test_set_inverted):
    result_list.append({
        'pred_value': int(inverted_row[0] + act_bookings[i]),
        'date': booking_dates[i],
    })
df_result = pd.DataFrame(result_list)

In [None]:
# Build the train prediction table: one row per date in train_list with the
# predicted bookings (column 0 of the inverse-transformed row plus the
# actual bookings at the same position).

booking_dates2 = list(train_list['date'])
act_bookings2 = list(train_list['booked'])
result_list2 = []
for i, inverted_row in enumerate(pred_train_set_inverted):
    result_list2.append({
        'pred_value': int(inverted_row[0] + act_bookings2[i]),
        'date': booking_dates2[i],
    })
df_result2 = pd.DataFrame(result_list2)

In [None]:
# Left-join the predictions onto the actual bookings by date. Every actual
# row is kept; dates without a prediction get NaN in 'pred_value'.

df_budgets_test6 = df_budgets_new.merge(df_result, on='date', how='left')
df_budgets_train6 = df_budgets_new.merge(df_result2, on='date', how='left')

In [None]:
# Plot actual bookings against the test and train predictions.

actual_trace = go.Scatter(
    x=df_budgets_new['date'],
    y=df_budgets_new['booked'],
    name='Actual',
)
test_trace = go.Scatter(
    x=df_budgets_test6['date'],
    y=df_budgets_test6['pred_value'],
    name='Test Prediction',
)
train_trace = go.Scatter(
    x=df_budgets_train6['date'],
    y=df_budgets_train6['pred_value'],
    name='Train Prediction',
)

plot_data = [actual_trace, test_trace, train_trace]
plot_layout = go.Layout(title='Booking Prediction Using GRU (Adam)')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Test-set error metrics, computed directly on y_test and the raw model
# output test_pred (no inverse transform is applied in this cell).
rmse = sqrt(mean_squared_error(y_test, test_pred))
print('Test RMSE: %.3f' % rmse)

# MAE is already in the units of the target. The original took its square
# root, which reported a value that is not the mean absolute error.
mae = mean_absolute_error(y_test, test_pred)
print('Test MAE: %.3f' % mae)

In [None]:
# Train-set error metrics, computed directly on y_train and the raw model
# output train_pred (no inverse transform is applied in this cell).
rmse = sqrt(mean_squared_error(y_train, train_pred))
print('Train RMSE: %.3f' % rmse)

# MAE is already in the units of the target. The original took its square
# root, which reported a value that is not the mean absolute error.
mae = mean_absolute_error(y_train, train_pred)
print('Train MAE: %.3f' % mae)

### Future Predictions

In [None]:
# Use every row of df_model as training data for the forecasting model
# (no test rows are held out in this section).
train_set = df_model.values

In [None]:
# Fit a Min-Max scaler mapping every column of train_set to [-1, 1].
# This rebinds `scaler`, replacing any scaler fitted earlier in the notebook.

scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_set)

In [None]:
# Reshape the training set to its own (rows, columns) shape, then scale it
# with the scaler fitted above.

train_set = train_set.reshape((train_set.shape[0], train_set.shape[1]))
train_set_scaled = scaler.transform(train_set)

In [None]:
# Column 0 is the target; the remaining columns are the features.
y_train = train_set_scaled[:, :1]
X_train = train_set_scaled[:, 1:]
# Insert a middle axis of length 1: (samples, 1, features).
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))

In [None]:
# Build the forecasting model: a single-layer stateful GRU regressor trained
# with the Adam optimizer.
# Seed NumPy's global RNG before any layer is created.
# NOTE(review): only NumPy is seeded here; whether that is enough to make the
# backend deterministic is not visible from this cell.
np.random.seed(1337)
print('Building model...')
model = Sequential()
# A stateful layer needs a fixed batch size; it is pinned to 1 here, which is
# why fit/predict below are called with batch_size=1.
model.add(GRU(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful = True))
model.add(Dropout(0.2))
model.add(Dense(1))
# NOTE(review): y_train is scaled to [-1, 1] by the MinMaxScaler above, but a
# ReLU output cannot produce negative values, so negative scaled targets are
# unreachable - confirm this activation is intended.
model.add(Activation('relu'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

In [None]:
# Train on the full data set for 10 epochs in chronological order
# (no shuffling, no validation split), one sample per batch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=1,
    verbose=0,
    shuffle=False,
)

In [None]:
# Predict on the same rows the model was trained on (batch size 1, matching
# the model's fixed batch_input_shape).
y_pred = model.predict(X_train,batch_size=1)

In [None]:
# Drop the last 13 predictions.
# NOTE(review): the constant 13 is unexplained. The rows kept here are later
# mapped onto at most 12 future dates - confirm the intended length.
# This cell rebinds y_pred, so running it twice trims 26 rows.
y_pred = y_pred[:-13]

In [None]:
# Seeing the booking prediction
# Reshape y_pred from (n, k) to (n, 1, k) so each row can be concatenated
# with the matching feature row along axis 1.

y_pred = y_pred.reshape((y_pred.shape[0], 1, y_pred.shape[1]))

In [None]:
# Rebuild for inverse transform: each row is
# [prediction, feature_1, ..., feature_n].
# y_pred may be shorter than X_train (it was trimmed above); zip stops at the
# shorter one, matching the original range(len(y_pred)) loop.
# The original loop called np.concatenate twice per row and discarded the
# first result; each row is now built once.

pred_set = [
    np.concatenate([pred_row, feature_row], axis=1)
    for pred_row, feature_row in zip(y_pred, X_train)
]

In [None]:
# Stack the rebuilt rows and flatten them to 2-D: (n_rows, n_columns).
# The reshape keeps axes 0 and 2, so it only succeeds when the middle axis
# has length 1.
pred_set = np.array(pred_set)
pred_set = pred_set.reshape((pred_set.shape[0], pred_set.shape[2]))

In [None]:
# Inverse transform: map the rebuilt rows back to the original units using
# the scaler fitted on the full data set in this section.

pred_set_inverted = scaler.inverse_transform(pred_set)

In [None]:
# Move 'date' into the index so the last date can be read from
# df_budgets.index below. A later cell restores the column with reset_index.
# NOTE: not idempotent - once 'date' is no longer a column, re-running this
# cell fails.
df_budgets = df_budgets.set_index('date')

In [None]:
# Show the last rows to see the final date currently in the index.
df_budgets.tail()

In [None]:
# Generate future dates: the last indexed date plus 0..12 months. Offset 0
# (the last known date itself) is dropped, leaving 12 future months in an
# empty frame that has the same columns as df_budgets.

last_known_date = df_budgets.index[-1]
add_dates = [last_known_date + DateOffset(months=x) for x in range(13)]
future_dates = pd.DataFrame(index=add_dates[1:], columns=df_budgets.columns)

In [None]:
# Name the index 'date', then turn it back into a regular column.
future_dates = future_dates.rename_axis('date')
future_dates = future_dates.reset_index()
future_dates.tail()

In [None]:
# Build the forecast table: pair each inverse-transformed prediction
# (column 0) with a future date, by position.
# NOTE(review): booking_dates holds at most 12 dates, so more than 12 rows in
# pred_set_inverted raises IndexError - confirm the lengths match.

booking_dates = list(future_dates[-12:]['date'])
result_list = []
for i, inverted_row in enumerate(pred_set_inverted):
    result_list.append({
        'pred_value': int(inverted_row[0]),
        'date': booking_dates[i],
    })
df_result = pd.DataFrame(result_list)

In [None]:
# Restore 'date' as a regular column (undoes the earlier set_index).
df_budgets = df_budgets.rename_axis('date')
df_budgets = df_budgets.reset_index()
df_budgets.tail()

In [None]:
# Append the forecast rows below the actual bookings and renumber the index
# from 0. Columns missing on either side are filled with NaN.

df_budgets_forecast = pd.concat([df_budgets_new, df_result]).reset_index(drop=True)

In [None]:
# Take rows 19-24 (by position) of the merged frame from the GRU (SGD)
# section.
# NOTE(review): presumably these are the 6 test rows that carry a
# 'pred_value' - the hard-coded positions depend on the data length, confirm.
# NOTE(review): the forecasting model above uses Adam, whose merged frame is
# df_budgets_test6 - confirm df_budgets_test5 is the intended source.
df_budgets_forecast2 = df_budgets_test5.iloc[19:25]
df_budgets_forecast2

In [None]:
# Fill missing cells in the forecast frame from df_budgets_forecast2; values
# are matched by index and column labels, and existing values are kept.
df_budgets_forecast = df_budgets_forecast.fillna(df_budgets_forecast2)

In [None]:
# Plot actual bookings and the predicted values on one date axis.

actual_trace = go.Scatter(
    x=df_budgets_forecast['date'],
    y=df_budgets_forecast['booked'],
    name='Actual',
)
predicted_trace = go.Scatter(
    x=df_budgets_forecast['date'],
    y=df_budgets_forecast['pred_value'],
    name='Predicted',
)

plot_data = [actual_trace, predicted_trace]
plot_layout = go.Layout(title='Future Booking Prediction')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [None]:
# Completion marker: printed only if every cell above ran without error.
print('End of notebook.')