In [1]:
import warnings

warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow import keras
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot


import datetime
import holidays

import seaborn as sns

In [242]:
# Load the raw transaction export and discard the columns the forecast never uses.
df_trans = pd.read_excel(
    './demand-forecasting-kernels-only/data_friday_ai.xlsx',
    engine='openpyxl',
).drop(columns=['customer_id', 'employee_id', 'invoice_no.', 'trans_price'])

In [508]:
def sort_data_weekly(df):
    """Total the sold quantity per (item, store, timestamp) and tag each row with its ISO week.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with ``item_id``, ``store_id``, ``time_stamp`` and
        ``quantity`` columns. ``time_stamp`` must be datetime-like, because the
        week number is read from the resulting index via ``isocalendar()``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` (the original ``time_stamp``) with columns
        ``item``, ``store``, ``sales`` (summed quantity) and ``week``
        (ISO week number of the index value).
    """
    per_timestamp = (
        df.sort_values('time_stamp')
          .groupby(['item_id', 'store_id', 'time_stamp'], as_index=False)
          .agg({'quantity': ['sum']})
    )
    # The list-style aggregation yields two-level column labels; flatten them
    # to the short names used by the rest of the notebook.
    per_timestamp.columns = ['item', 'store', 'date', 'sales']

    per_timestamp = per_timestamp.set_index('date')
    per_timestamp['week'] = per_timestamp.index.isocalendar().week
    return per_timestamp


def select_item_shop(df,item_id,shop_id):
    """Return the rows for one item in one store, without the id columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``item`` and ``store`` columns.
    item_id, shop_id
        Values matched (by equality) against ``item`` and ``store``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with ``item`` and ``store`` dropped.
    """
    is_match = (df['store'] == shop_id) & (df['item'] == item_id)
    return df[is_match].drop(columns=['item', 'store'])


def additional_info(df, include_day_flags=False):
    """Add calendar features derived from the frame's datetime index.

    The ``season`` column is the calendar quarter of each index value
    (months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4).

    The input frame is not modified; a copy with the new column(s) is
    returned. This matters because the caller passes in a filtered slice of a
    larger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime-like values.
    include_day_flags : bool, default False
        When True, also add ``is_weekend`` (1 for Saturday/Sunday, else 0) and
        ``is_holiday`` (1 when the date is in ``holidays.SG()``, else 0).
        These flags are off by default, so the Singapore holiday table is only
        built when they are requested.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``season`` appended, preceded by ``is_weekend``
        and ``is_holiday`` when ``include_day_flags`` is True.
    """
    out = df.copy()
    dates = pd.DatetimeIndex(out.index)

    if include_day_flags:
        # dayofweek: Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
        out['is_weekend'] = np.asarray(dates.dayofweek >= 5, dtype='int64')
        sg_holidays = holidays.SG()
        out['is_holiday'] = np.array(
            [1 if day in sg_holidays else 0 for day in dates], dtype='int64'
        )

    # The calendar quarter is exactly the month bucketing used for "season".
    out['season'] = np.asarray(dates.quarter, dtype='int64')
    return out

def sum_by_week(df):
    """Collapse the frame to one row per (``week``, ``season``) pair by summing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``week`` and ``season`` columns plus the columns to total.

    Returns
    -------
    pandas.DataFrame
        ``week`` and ``season`` as ordinary columns followed by the summed
        remaining columns, on a fresh 0..n-1 index.

    NOTE(review): the grouping key carries no year, so rows that share a week
    number but fall in different years would be added together. Confirm the
    data covers a single year.
    """
    return df.groupby(['week', 'season']).sum().reset_index()

def train_test_split(df, train_frac=0.90):
    """Split ``df`` by position into a leading train part and a trailing test part.

    No shuffling is done: the first ``int(len(df) * train_frac)`` rows become
    the training set and the remaining rows the test set.

    Note that this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook; after this cell runs, the name refers
    to this positional split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; row order is preserved.
    train_frac : float, default 0.90
        Fraction of rows (from the top) assigned to the training set.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` as independent copies, so later in-place scaling of
        either part cannot write through to ``df``.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    train_size = int(len(df) * train_frac)
    return df.iloc[:train_size].copy(), df.iloc[train_size:].copy()

def feature_transformer(df, input_features, f_transformer=None,
                        target_transformer=None, return_transformers=False):
    """Robust-scale the selected feature columns and the ``sales`` target.

    By default a fresh ``RobustScaler`` is fitted on ``df`` for the features
    and another for ``sales``, as before. Passing already-fitted scalers lets a
    held-out split be scaled with the statistics learned on the training split
    instead of its own.

    The input frame is left untouched; scaling is applied to a copy. The scaled
    feature columns replace the originals as float columns, so integer-typed
    inputs are not written into in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``input_features`` and a ``sales`` column.
    input_features : list of str
        Names of the feature columns to scale together.
    f_transformer : fitted scaler, optional
        Scaler to apply to ``input_features``. Fitted on ``df`` when omitted.
    target_transformer : fitted scaler, optional
        Scaler to apply to ``sales``. Fitted on ``df`` when omitted.
    return_transformers : bool, default False
        When True, return ``(scaled_df, f_transformer, target_transformer)`` so
        the scalers can be reused (e.g. for another split or for inverting
        predictions).

    Returns
    -------
    pandas.DataFrame or tuple
        The scaled copy of ``df``, or the 3-tuple described above.
    """
    scaled = df.copy()
    feature_values = scaled[input_features].to_numpy(dtype=float)
    target_values = scaled[['sales']]

    if f_transformer is None:
        f_transformer = RobustScaler().fit(feature_values)
    if target_transformer is None:
        target_transformer = RobustScaler().fit(target_values)

    # Whole-column replacement (not .loc) so the columns take the float dtype.
    scaled[input_features] = f_transformer.transform(feature_values)
    scaled['sales'] = target_transformer.transform(target_values).ravel()

    if return_transformers:
        return scaled, f_transformer, target_transformer
    return scaled


def data_inverse(df, target_df, target_transformer=None):
    """Map scaled ``sales`` values back to the original units.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an unscaled ``sales`` column. It is only used to fit a
        ``RobustScaler`` when ``target_transformer`` is not supplied.
    target_df : array-like
        Scaled values to invert (any shape; treated as a flat list of values).
    target_transformer : fitted scaler, optional
        The scaler that produced the scaled values. Supplying it makes this an
        exact inverse of the forward scaling. When omitted, a new scaler is
        fitted on ``df[['sales']]``, which only matches the forward scaling if
        that scaling was fitted on the same data.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n)`` holding the un-scaled values.
    """
    if target_transformer is None:
        target_transformer = RobustScaler().fit(df[['sales']])

    # The scaler was fitted on a single column, so hand it one value per row;
    # then restore the (1, n) layout callers have always received.
    as_column = np.asarray(target_df).reshape(-1, 1)
    restored = target_transformer.inverse_transform(as_column)
    return restored.reshape(1, -1)


def create_dataset(X, y, time_steps=1):
    """Turn a table into overlapping windows paired with the next target value.

    Window ``k`` holds rows ``k .. k + time_steps - 1`` of ``X`` and is paired
    with ``y`` at position ``k + time_steps``.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows to window; every column becomes a feature.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    time_steps : int, default 1
        Number of consecutive rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows shaped ``(samples, time_steps, n_features)`` and targets shaped
        ``(samples,)``. Both are empty when ``X`` has no more than
        ``time_steps`` rows.
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)


    

In [510]:
# Build the modelling table for item 22 in store 1:
# aggregate sales -> keep one item/store -> add season -> total per (week, season).
df = (
    df_trans
    .pipe(sort_data_weekly)
    .pipe(select_item_shop, 22, 1)
    .pipe(additional_info)
    .pipe(sum_by_week)
)

# NOTE: weeks without any sales are not filled in; they are simply absent from `df`.
df.head()




Unnamed: 0,week,season,sales
0,1,1,1
1,2,1,1
2,3,1,8
3,4,1,2
4,5,1,1


In [511]:
# Hold out the trailing rows, then robust-scale `week`, `season` and `sales`
# in each split.
# NOTE(review): `feature_transformer` is called once per split, so `test` is
# scaled independently of `train` — confirm that scaling the held-out rows with
# their own statistics is intended.
train, test = train_test_split(df)
train, test = (
    feature_transformer(split, ['week', 'season']) for split in (train, test)
)

In [560]:
#contain how many dates from history (time_steps)
time_steps = 4

# reshape to [samples, time_steps, n_features]

X_train, y_train = create_dataset(train, train.sales, time_steps)
X_test, y_test = create_dataset(test, test.sales, time_steps)

print(X_train.shape, y_train.shape)

(40, 4, 3) (40,)


In [561]:
# Show the windowed test inputs produced by create_dataset from the scaled `test` frame
X_test

array([[[-1. ,  0. ,  0. ],
        [-0.5,  0. ,  0. ],
        [ 0. ,  0. ,  2. ],
        [ 0.5,  0. ,  1. ]]])

In [562]:
# Show the test targets (scaled `sales` values that follow each test window)
y_test

array([0.])

In [563]:
# Bidirectional LSTM over each input window, followed by dropout and a single
# linear output unit; trained with mean squared error and the Adam optimizer.
model = keras.Sequential([
    keras.layers.Bidirectional(
        keras.layers.LSTM(
            units=32,
            # (time_steps, n_features) of one training window
            input_shape=X_train.shape[1:],
        )
    ),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(units=1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [564]:
# Fit silently for 10 epochs without shuffling the windows; a tenth of the
# training samples is set aside for validation via `validation_split`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=0,
    validation_split=0.1,
    shuffle=False,
)

In [565]:
# Predict on both splits and report RMSE. The targets passed in are the scaled
# `sales` values, so the errors are in scaled units. The "Validation" figure is
# computed on the held-out test windows (X_test / y_test).
train_pred = model.predict(X_train)
valid_pred = model.predict(X_test)

print("LSTM:")
for label, actual, predicted in (
    ('Train rmse:', y_train, train_pred),
    ('Validation rmse:', y_test, valid_pred),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

LSTM:
Train rmse: 0.5506502524362327
Validation rmse: 0.06797383725643158


In [566]:
# Convert the scaled test target and prediction back to sales units.
# NOTE(review): `data_inverse` fits a new scaler on the full `df`, while the
# forward scaling was fitted separately on `train` and on `test` — verify the
# recovered values are on the intended scale.
y_test_inv, y_pred_inv = (
    data_inverse(df, scaled) for scaled in (y_test, valid_pred)
)

In [567]:
# Show the model's test prediction after data_inverse has been applied
y_pred_inv

array([[2.1359477]], dtype=float32)

In [568]:
# Show the test target after data_inverse has been applied, for comparison with y_pred_inv
y_test_inv

array([[2.]])