In [1]:
import numpy as np #Numpy for array Calculation
import pandas as pd #Pandas for importing data

from keras.models import Model, Sequential #Model
from keras.optimizers import SGD, Adam, RMSprop #Optimizer
from keras.layers import Input, Dense, Dropout, Flatten, Lambda, Embedding #Hidden Layers
from keras.initializers import RandomNormal, Constant #Random Weight Initializers
from keras import backend as K #Keras 
import tensorflow as tf #Tensorflow

from sklearn.preprocessing import StandardScaler #Standardization
from sklearn.model_selection import train_test_split #Train,Test,Split
from sklearn.metrics import mean_squared_error #Evaluation metrics

from math import sqrt #Math module for rmse calculation (i.e sqrt(mse))

import warnings #To Ignore warnings which is annoying in notebook
# NOTE(review): this suppresses every warning category for the rest of the session, so any
# deprecation or chained-assignment warnings that later cells trigger will not be shown.
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [3]:
# Read the three raw CSV inputs: training rows, test rows and per-store metadata.
# NOTE(review): the paths are absolute and machine-specific; the notebook only runs where
# these exact files exist.
train = pd.read_csv(r'E:\Documents\University Assignments\NN and DL\Group Project\train.csv')
test = pd.read_csv(r'E:\Documents\University Assignments\NN and DL\Group Project\test.csv')
store = pd.read_csv(r'E:\Documents\University Assignments\NN and DL\Group Project\store.csv')

# The test rows carry no target. Tag them with a sentinel Sales value of -1 so they can be
# separated from the training rows again once the shared preprocessing is done.
test = test.assign(Sales=-1)

# Stack train on top of test with a fresh 0..n-1 index so both are analysed and
# preprocessed in a single pass.
full = pd.concat([train, test], ignore_index=True)

# Merging store and preprocessing data

In [4]:
# Attach the per-store attributes to every row. A left join is used on purpose: an inner
# join dropped rows that had no match and gave a worse Kaggle score, whereas a left join
# keeps every train/test row and simply adds the store information where it exists.
full = pd.merge(full, store, how='left', on='Store')

In [5]:
# Inspect the column dtypes of the merged frame.
full.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                    float64
Open                         float64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
Id                           float64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

In [6]:
# Peek at the first two Date values to see their textual format.
full['Date'].head(2)

0    2015-07-31
1    2015-07-31
Name: Date, dtype: object

In [7]:
# Derive Year / Month / Day / WeekOfYear features from the Date strings.
# The column is parsed once and reused instead of building a new DatetimeIndex per feature.
parsed_dates = pd.to_datetime(full['Date'])

full['Year'] = parsed_dates.dt.year
full['Month'] = parsed_dates.dt.month
full['Day'] = parsed_dates.dt.day

# `weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0; the ISO calendar week
# is its documented replacement. Fall back to the old accessor only on a pandas that
# predates `isocalendar()`.
if hasattr(parsed_dates.dt, 'isocalendar'):
    iso_week = parsed_dates.dt.isocalendar().week
else:
    iso_week = parsed_dates.dt.weekofyear
# isocalendar() yields a nullable UInt32 column; cast to a plain integer dtype so the
# feature stays a regular numeric column.
full['WeekOfYear'] = iso_week.astype('int64')

In [8]:
# Preview the first two rows of the combined frame.
full.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,...,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear
0,1,5,2015-07-31,5263,555.0,1.0,1,0,1,,...,9.0,2008.0,0,,,,2015,7,31,31
1,2,5,2015-07-31,6064,625.0,1.0,1,0,1,,...,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31,31


In [9]:
# List the distinct values of each categorical column. StateHoliday, StoreType and
# Assortment can be one-hot encoded directly; PromoInterval needs dedicated handling first,
# so it is cleaned before the others.
# NOTE(review): the displayed result shows StateHoliday holding both the string '0' and the
# integer 0, which are treated as two different categories.

full['StateHoliday'].unique(),full['StoreType'].unique(),full['Assortment'].unique(),full['PromoInterval'].unique()

(array(['0', 'a', 'b', 'c', 0], dtype=object),
 array(['c', 'a', 'd', 'b'], dtype=object),
 array(['a', 'c', 'b'], dtype=object),
 array([nan, 'Jan,Apr,Jul,Oct', 'Feb,May,Aug,Nov', 'Mar,Jun,Sept,Dec'],
       dtype=object))

In [10]:
# Map the numeric month to the abbreviation used inside PromoInterval (note 'Sept', not
# 'Sep') so each row's month can be compared against the comma-separated interval string.
month_string = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
full['month_string'] = full.Month.map(month_string)

# Default every row to "not a promo month"; matching rows are flagged with 1 below.
full['IsPromoMonth'] = 0

# Walk the distinct, non-missing PromoInterval strings. Rows without a PromoInterval (NaN)
# are skipped explicitly and keep the default 0, instead of relying on str(NaN) == 'nan'
# never matching a month name.
for interval in full.PromoInterval.dropna().unique():
    promo_months = interval.split(',')
    # A row is in a promo month when it has this interval AND its month is listed in it.
    in_promo_month = (full.PromoInterval == interval) & full.month_string.isin(promo_months)
    full.loc[in_promo_month, 'IsPromoMonth'] = 1

In [11]:
# Preview: IsPromoMonth is 1 where the row's month_string appears in its PromoInterval,
# and 0 otherwise.
full.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,...,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,month_string,IsPromoMonth
0,1,5,2015-07-31,5263,555.0,1.0,1,0,1,,...,0,,,,2015,7,31,31,Jul,0
1,2,5,2015-07-31,6064,625.0,1.0,1,0,1,,...,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31,31,Jul,1
2,3,5,2015-07-31,8314,821.0,1.0,1,0,1,,...,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,31,Jul,1
3,4,5,2015-07-31,13995,1498.0,1.0,1,0,1,,...,0,,,,2015,7,31,31,Jul,0
4,5,5,2015-07-31,4822,559.0,1.0,1,0,1,,...,0,,,,2015,7,31,31,Jul,0


In [12]:
# Report the percentage of missing values per column of the combined (train + test) frame.
# Columns without any missing value are not printed.
n_rows = full.shape[0]
for col in full.columns:
    pct_missing = full[col].isnull().sum() / n_rows * 100
    if pct_missing > 0:
        print('The Missing Values of Columns ', col, ' is:- ', pct_missing)

The Missing Values of Columns  Customers  is:-  3.8824639964017664
The Missing Values of Columns  Open  is:-  0.0010394057622765632
The Missing Values of Columns  Id  is:-  96.11753600359823
The Missing Values of Columns  CompetitionDistance  is:-  0.25871754337393
The Missing Values of Columns  CompetitionOpenSinceMonth  is:-  31.991397499945666
The Missing Values of Columns  CompetitionOpenSinceYear  is:-  31.991397499945666
The Missing Values of Columns  Promo2SinceWeek  is:-  49.632853537334036
The Missing Values of Columns  Promo2SinceYear  is:-  49.632853537334036
The Missing Values of Columns  PromoInterval  is:-  49.632853537334036


In [13]:
# Re-check the column dtypes of the combined frame.
full.dtypes

Store                          int64
DayOfWeek                      int64
Date                          object
Sales                          int64
Customers                    float64
Open                         float64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
Id                           float64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
Year                           int64
Month                          int64
Day                            int64
WeekOfYear                     int64
month_string                  object
IsPromoMonth                   int64
dtype: object

In [14]:
# Column groups used to build the model input.
# Customers, Id, Date, PromoInterval, month_string and WeekOfYear appear in neither list.

# Columns used as-is as numeric features ("Sales" is the target and is carried along so the
# train/test rows can be separated again later).
cols_num = [
    "Sales",
    "DayOfWeek",
    "Open",
    "Promo",
    "SchoolHoliday",
    "CompetitionDistance",
    "CompetitionOpenSinceMonth",
    "CompetitionOpenSinceYear",
    "Promo2",
    "Promo2SinceWeek",
    "Promo2SinceYear",
    "Year",
    "Month",
    "Day",
    "IsPromoMonth",
    "Store",
]

# Categorical (text) columns that are one-hot encoded.
cols_text = [
    "StateHoliday",
    "StoreType",
    "Assortment",
]

In [15]:
# Build the numeric feature frame. Take an explicit copy: this frame is modified afterwards
# (null filling), and modifying a bare column selection of `full` is a chained assignment
# whose effect pandas does not guarantee.
full_numeric = full[cols_num].copy()

In [16]:
# CompetitionDistance has few missing values (about 0.26%), so they are imputed with the
# column median. Every other remaining null is replaced by 0.
# NOTE(review): the 0 fill also applies to 'Open', which has only ~0.001% missing values
# rather than the >30% of the competition/promo columns - confirm 0 (closed) is intended.
#
# The result is assigned back instead of using `inplace=True` on a column selection: that
# form is a chained assignment and becomes a silent no-op under pandas copy-on-write.
competition_median = full_numeric['CompetitionDistance'].median()
full_numeric = (
    full_numeric
    .fillna({'CompetitionDistance': competition_median})
    .fillna(0)
)

In [17]:
# Sanity check: count the remaining nulls in each numeric column.
full_numeric.isnull().sum()

Sales                        0
DayOfWeek                    0
Open                         0
Promo                        0
SchoolHoliday                0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
Year                         0
Month                        0
Day                          0
IsPromoMonth                 0
Store                        0
dtype: int64

In [18]:
# Preview the numeric feature frame.
full_numeric.head()

Unnamed: 0,Sales,DayOfWeek,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,IsPromoMonth,Store
0,5263,5,1.0,1,1,1270.0,9.0,2008.0,0,0.0,0.0,2015,7,31,0,1
1,6064,5,1.0,1,1,570.0,11.0,2007.0,1,13.0,2010.0,2015,7,31,1,2
2,8314,5,1.0,1,1,14130.0,12.0,2006.0,1,14.0,2011.0,2015,7,31,1,3
3,13995,5,1.0,1,1,620.0,9.0,2009.0,0,0.0,0.0,2015,7,31,0,4
4,4822,5,1.0,1,1,29910.0,4.0,2015.0,0,0.0,0.0,2015,7,31,0,5


In [19]:
# One-hot encode the categorical columns in a single step. Missing values do not get an
# indicator column of their own (dummy_na=False).
# NOTE(review): StateHoliday mixes the string '0' with the integer 0, so the encoding emits
# two separate "StateHoliday_0" columns. The later reshape to 27 features counts both, so
# normalising the column must be done together with that reshape.
full_categorical = pd.get_dummies(full[cols_text], dummy_na=False)

In [20]:
# Preview the one-hot encoded categorical columns.
full_categorical.head()

Unnamed: 0,StateHoliday_0,StateHoliday_0.1,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,0,1,0,0,0,0,0,1,0,1,0,0
1,0,1,0,0,0,1,0,0,0,1,0,0
2,0,1,0,0,0,1,0,0,0,1,0,0
3,0,1,0,0,0,0,0,1,0,0,0,1
4,0,1,0,0,0,1,0,0,0,1,0,0


In [21]:
# Sanity check: count nulls in each one-hot encoded column.
full_categorical.isnull().sum()

StateHoliday_0    0
StateHoliday_0    0
StateHoliday_a    0
StateHoliday_b    0
StateHoliday_c    0
StoreType_a       0
StoreType_b       0
StoreType_c       0
StoreType_d       0
Assortment_a      0
Assortment_b      0
Assortment_c      0
dtype: int64

In [22]:
# Place the numeric and the one-hot encoded columns side by side to form the final feature
# frame. `full` is rebound here and no longer refers to the merged raw data.
full = pd.concat((full_numeric, full_categorical), axis='columns')

In [23]:
# Preview the combined feature frame.
full.head()

Unnamed: 0,Sales,DayOfWeek,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,...,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,5263,5,1.0,1,1,1270.0,9.0,2008.0,0,0.0,...,0,0,0,0,0,1,0,1,0,0
1,6064,5,1.0,1,1,570.0,11.0,2007.0,1,13.0,...,0,0,0,1,0,0,0,1,0,0
2,8314,5,1.0,1,1,14130.0,12.0,2006.0,1,14.0,...,0,0,0,1,0,0,0,1,0,0
3,13995,5,1.0,1,1,620.0,9.0,2009.0,0,0.0,...,0,0,0,0,0,1,0,0,0,1
4,4822,5,1.0,1,1,29910.0,4.0,2015.0,0,0.0,...,0,0,0,1,0,0,0,1,0,0


In [24]:
# Separate the combined frame back into training and test rows. Test rows are identified by
# the sentinel Sales value of -1 that was written into the test frame before concatenation.
is_test_row = full['Sales'] == -1
full_train = full.loc[~is_test_row]
full_test = full.loc[is_test_row]

In [25]:
# Preview the training rows.
full_train.head()

Unnamed: 0,Sales,DayOfWeek,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,...,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,5263,5,1.0,1,1,1270.0,9.0,2008.0,0,0.0,...,0,0,0,0,0,1,0,1,0,0
1,6064,5,1.0,1,1,570.0,11.0,2007.0,1,13.0,...,0,0,0,1,0,0,0,1,0,0
2,8314,5,1.0,1,1,14130.0,12.0,2006.0,1,14.0,...,0,0,0,1,0,0,0,1,0,0
3,13995,5,1.0,1,1,620.0,9.0,2009.0,0,0.0,...,0,0,0,0,0,1,0,0,0,1
4,4822,5,1.0,1,1,29910.0,4.0,2015.0,0,0.0,...,0,0,0,1,0,0,0,1,0,0


In [26]:
# Number of rows and columns of the training frame (columns include the Sales target).
full_train.shape

(1017209, 28)

In [27]:
# Preview the test rows (Sales holds the -1 sentinel).
full_test.head()

Unnamed: 0,Sales,DayOfWeek,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,...,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
1017209,-1,4,1.0,1,0,1270.0,9.0,2008.0,0,0.0,...,0,0,0,0,0,1,0,1,0,0
1017210,-1,4,1.0,1,0,14130.0,12.0,2006.0,1,14.0,...,0,0,0,1,0,0,0,1,0,0
1017211,-1,4,1.0,1,0,24000.0,4.0,2013.0,0,0.0,...,0,0,0,1,0,0,0,0,0,1
1017212,-1,4,1.0,1,0,7520.0,10.0,2014.0,0,0.0,...,0,0,0,1,0,0,0,1,0,0
1017213,-1,4,1.0,1,0,2030.0,8.0,2000.0,0,0.0,...,0,0,0,1,0,0,0,0,0,1


In [28]:
# Number of rows and columns of the test frame.
full_test.shape

(41088, 28)

# Model Building

In [29]:
#Function to split the data into 80%-Train 20%test (not manuall split) - using sklearn model selection to split 
def load_train_data(scaler_x, scaler_y):
    """Split the notebook-level `train` frame 80/20 and standardise both parts.

    The split is done BEFORE scaling and the scalers are fitted on the training part only,
    so no statistics of the hold-out rows leak into the transformation. The hold-out part
    is transformed with the scalers fitted on the training part.

    Args:
        scaler_x: unfitted scaler for the features; fitted in place on the training part.
        scaler_y: unfitted scaler for the target; fitted in place on the training part.
            Keep a reference to it to inverse-transform predictions later.

    Returns:
        ((X_train, y_train), (X_test, y_test)) as scaled 2-D arrays; the targets have a
        single column.
    """
    features = train.drop(["Sales"], axis=1)  # independent variables
    target = np.array(train["Sales"]).reshape((len(features), 1))  # dependent variable

    # Same split parameters as before: 20% hold-out, fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, random_state=42
    )

    # Fit on the training part only, then apply the same transformation to the hold-out.
    X_train = scaler_x.fit_transform(X_train)
    y_train = scaler_y.fit_transform(y_train)
    X_test = scaler_x.transform(X_test)
    y_test = scaler_y.transform(y_test)

    return (X_train, y_train), (X_test, y_test)

In [30]:
#Creating fucntion to load test data for predicition
def load_test_data(scaler=None):
    """Return the scaled feature matrix of the notebook-level `test` frame.

    The test features must be scaled with the SAME transformation as the training features;
    fitting a fresh scaler on the test rows would feed the model inputs on a different
    scale from the one it was trained on.

    Args:
        scaler: fitted feature scaler to apply. When omitted, the notebook-level `scaler_x`
            is used if it exists and has been fitted. Only when no fitted scaler is
            available does the function fall back to fitting a new StandardScaler on the
            test rows (the previous behaviour).

    Returns:
        2-D array of scaled features, one row per test row.
    """
    X_test = test.drop(["Sales"], axis=1)  # independent variables only

    if scaler is None:
        # Pick up the training feature scaler defined at notebook level, if it is fitted.
        candidate = globals().get("scaler_x")
        if candidate is not None and hasattr(candidate, "mean_"):
            scaler = candidate

    if scaler is None:
        return StandardScaler().fit_transform(X_test)
    return scaler.transform(X_test)

In [31]:
#Create Function for root mean square percentage error
def rmspe_val(y_true, y_pred):
    """Root mean square percentage error computed with NumPy.

    Rows whose true value is 0 are excluded from the mean, because the percentage error is
    undefined there and would otherwise turn the whole result into inf/nan.

    Args:
        y_true: 2-D array-like of true values, shape (n_samples, n_outputs).
        y_pred: 2-D array-like of predictions with the same shape as `y_true`.

    Returns:
        The RMSPE of the first output column as a float. The result is nan when every true
        value in that column is 0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    nonzero = y_true != 0
    # Divide by 1 where the target is 0 to keep the division safe; those entries are
    # replaced by 0 right after and are not counted in the denominator.
    safe_true = np.where(nonzero, y_true, 1.0)
    pct_error = np.where(nonzero, (y_true - y_pred) / safe_true, 0.0)

    n_valid = nonzero.sum(axis=0)
    return np.sqrt(np.sum(np.square(pct_error), axis=0) / n_valid)[0]

def rmspe(y_true, y_pred):
    """Root mean square percentage error built from Keras backend ops (used as a metric).

    The relative error is squared, averaged over axis 0 and square-rooted.
    NOTE(review): the division by `y_true` is unguarded, so zero targets yield inf/nan.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative = K.mean(K.square(relative_error), axis=0)
    return K.sqrt(mean_squared_relative)

def rmse(y_true, y_pred):
    """Root mean square error built from Keras backend ops (used as a metric).

    The squared difference is averaged over all elements before taking the square root.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [44]:
#Create function to build model
def create_model():
    """Build and compile the LSTM regression model.

    Architecture: one LSTM layer with 512 units (ReLU activation) followed by a single
    linear output unit. No input shape is declared here.

    The model is compiled with mean squared error as the loss, the Adam optimizer
    (learning rate 1e-3, decay 1e-3), and the `rmse` and `rmspe` functions as metrics.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(LSTM(512, activation='relu'))
    model.add(Dense(1, activation="linear"))

    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    # NOTE(review): newer Keras optimizers may no longer accept `decay`; if this call fails
    # after an upgrade, switch to a learning-rate schedule instead.
    adam = Adam(learning_rate=1e-3, decay=1e-3)

    # Compile model
    model.compile(loss="mean_squared_error", optimizer=adam, metrics=[rmse, rmspe])

    return model

In [45]:
# From here on `train` / `test` refer to the preprocessed frames; they replace the raw CSV
# frames bound to these names when the data was loaded. The loader functions read these
# notebook-level names.
train = full_train
test = full_test

In [46]:
# Hyperparameters and load data to train the model
batch_size = 512 #Number of rows per gradient update
nb_epoch = 50 #Number of passes over the training data


scaler_x = StandardScaler() #Standardization of the features
scaler_y = StandardScaler() #Standardization of the target (kept to inverse-transform predictions)

(X_train, y_train), (X_test, y_test) = load_train_data(scaler_x, scaler_y) #Loading Standardization train,test data

# The LSTM expects (samples, timesteps, features). Each row is fed as a sequence of length
# 1, and the feature count is read from the data instead of being hard-coded, so the
# reshape keeps working if the set of feature columns changes.
n_features = X_train.shape[1]
X_train = X_train.reshape(-1, 1, n_features)
X_test = X_test.reshape(-1, 1, n_features)

# The targets stay 2-D, (samples, 1): the model ends in Dense(1) on a non-sequence LSTM
# output, so its predictions are (samples, 1) as well. Reshaping the targets to rank 3
# would only work through implicit squeezing inside Keras.

model = create_model()
#model.summary()

In [47]:
# Train the model. A further 20% of the training portion is held back by Keras for
# validation, in addition to the 20% hold-out returned by load_train_data.
# The returned history object is kept in `log`.
log = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=nb_epoch,
    validation_split=0.20,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


In [48]:
#Function for MSE,RMSE,RMSPE Calculation (Evalutation Metrics)
def show_info(model, X, y, weights = None, scaler = None):
    """Predict on `X` and report MSE, RMSE and RMSPE against `y`.

    The predictions are also stored in the notebook-level `predictions` variable, as
    before.

    Args:
        model: object exposing `predict(X, verbose=...)`.
        X: model input.
        y: true targets; any shape holding one value per sample (it is flattened).
        weights: optional per-sample weights applied to the averages.
        scaler: optional fitted target scaler. When given, `y` and the predictions are
            inverse-transformed first so the metrics are in original units. When omitted,
            the metrics are in the units of `y` as passed; RMSPE on standardised targets
            divides by values near 0 and is not meaningful.

    Returns:
        dict with the keys "mse", "rmse" and "rmspe". RMSPE ignores samples whose true
        value is 0 and is nan when there is no non-zero true value.
    """
    global predictions
    predictions = model.predict(X, verbose=1)

    y_true = np.asarray(y, dtype=float).reshape(-1, 1)
    y_pred = np.asarray(predictions, dtype=float).reshape(-1, 1)
    if scaler is not None:
        y_true = scaler.inverse_transform(y_true)
        y_pred = scaler.inverse_transform(y_pred)
    y_true, y_pred = y_true.ravel(), y_pred.ravel()

    sample_weights = None if weights is None else np.asarray(weights, dtype=float).ravel()

    mse = float(np.average(np.square(y_true - y_pred), weights=sample_weights))

    # Percentage error is undefined for zero targets; leave those samples out.
    nonzero = y_true != 0
    if nonzero.any():
        pct_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        pct_weights = None if sample_weights is None else sample_weights[nonzero]
        rmspe_value = sqrt(float(np.average(np.square(pct_error), weights=pct_weights)))
    else:
        rmspe_value = float('nan')

    metrics = {"mse": mse, "rmse": sqrt(mse), "rmspe": rmspe_value}
    print('MSE  :', metrics["mse"])
    print('RMSE :', metrics["rmse"])
    print('RMSPE:', metrics["rmspe"])
    return metrics

# Run the trained model on the 20% hold-out split returned by load_train_data.
show_info(model, X_test, y_test)



In [49]:
#Loading test data for prediction and submission.
test_data = load_test_data()
# Same (samples, timesteps=1, features) layout as the training input. The feature count is
# taken from the data instead of being hard-coded.
test_data = test_data.reshape(-1, 1, test_data.shape[1])

#Loading data because need to access ID variable for submission to kaggle
# (the Id column is not among the columns kept in the preprocessed test frame)
df_teste = pd.read_csv(r'E:\Documents\University Assignments\NN and DL\Group Project\test.csv')

In [50]:
# Predict the (standardised) sales for the test rows ...
scaled_sales = model.predict(test_data)

# ... and map them back to the original Sales scale with the target scaler that was fitted
# during training.
predict = scaler_y.inverse_transform(scaled_sales)



In [51]:
# Build the Kaggle submission frame: the Id column comes from the re-read raw test file,
# and the Sales column holds the predictions in original units.
submission = df_teste[["Id"]].copy()
submission['Sales'] = predict

# Write the submission without the index column.
submission.to_csv('LSTM.csv', index=False)