In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import warnings
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from sklearn.model_selection import KFold
from sklearn import preprocessing
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.4f' % x)
warnings.filterwarnings("ignore")

# Setting seeds to make the project more reproducible.
from numpy.random import seed
seed(0)
from tensorflow import set_random_seed
set_random_seed(0)

from sklearn.ensemble import RandomForestRegressor
import seaborn as sns; sns.set(style="ticks", color_codes=True)

In [0]:
# read data
train = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Business_Analytics/train.csv",sep=',', header=0)
features = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Business_Analytics/features.csv",sep=',', header=0).drop(columns=['IsHoliday'])
stores = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/Business_Analytics/stores.csv",sep=',', header=0)

# merge the different files into one dataset to work with
dataset = train.merge(stores, how='left').merge(features, how='left')
dataset = dataset[['Weekly_Sales','Store', 'Dept', 'Date', 'IsHoliday', 'Type', 'Size', 'Temperature', 'Fuel_Price',
                   'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']]

In [0]:
pd.set_option("display.max_columns", 17)
pd.set_option('display.width', 300)

dataset.head()


Unnamed: 0,Weekly_Sales,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,24924.5,1,1,2010-02-05,False,A,151315,42.31,2.572,,,,,,211.0964,8.106
1,46039.49,1,1,2010-02-12,True,A,151315,38.51,2.548,,,,,,211.2422,8.106
2,41595.55,1,1,2010-02-19,False,A,151315,39.93,2.514,,,,,,211.2891,8.106
3,19403.54,1,1,2010-02-26,False,A,151315,46.63,2.561,,,,,,211.3196,8.106
4,21827.9,1,1,2010-03-05,False,A,151315,46.5,2.625,,,,,,211.3501,8.106


In [89]:
print('\n',dataset.describe())


        Weekly_Sales       Store        Dept        Size  Temperature  Fuel_Price   MarkDown1   MarkDown2   MarkDown3   MarkDown4   MarkDown5         CPI  Unemployment
count   421570.0000 421570.0000 421570.0000 421570.0000  421570.0000 421570.0000 150681.0000 111248.0000 137091.0000 134967.0000 151432.0000 421570.0000   421570.0000
mean     15981.2581     22.2005     44.2603 136727.9157      60.0901      3.3610   7246.4202   3334.6286   1439.4214   3383.1683   4628.9751    171.2019        7.9603
std      22711.1835     12.7853     30.4921  60980.5833      18.4479      0.4585   8291.2213   9475.3573   9623.0783   6292.3840   5962.8875     39.1593        1.8633
min      -4988.9400      1.0000      1.0000  34875.0000      -2.0600      2.4720      0.2700   -265.7600    -29.1000      0.2200    135.1600    126.0640        3.8790
25%       2079.6500     11.0000     18.0000  93638.0000      46.6800      2.9330   2240.2700     41.6000      5.0800    504.2200   1878.4400    132.0227        6.8

In [0]:
cat_col = ['IsHoliday','Type']

for col in cat_col:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(dataset[col].values.astype('str'))
    dataset[col] = encoder.transform(dataset[col].values.astype('str'))
    
dataset.head()

Unnamed: 0,Weekly_Sales,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,24924.5,1,1,2010-02-05,0,0,151315,42.31,2.572,,,,,,211.0964,8.106
1,46039.49,1,1,2010-02-12,1,0,151315,38.51,2.548,,,,,,211.2422,8.106
2,41595.55,1,1,2010-02-19,0,0,151315,39.93,2.514,,,,,,211.2891,8.106
3,19403.54,1,1,2010-02-26,0,0,151315,46.63,2.561,,,,,,211.3196,8.106
4,21827.9,1,1,2010-03-05,0,0,151315,46.5,2.625,,,,,,211.3501,8.106


In [0]:
dataset = dataset.fillna(0)
dataset.head()

Unnamed: 0,Weekly_Sales,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,24924.5,1,1,2010-02-05,0,0,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.0964,8.106
1,46039.49,1,1,2010-02-12,1,0,151315,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.2422,8.106
2,41595.55,1,1,2010-02-19,0,0,151315,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.2891,8.106
3,19403.54,1,1,2010-02-26,0,0,151315,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.3196,8.106
4,21827.9,1,1,2010-03-05,0,0,151315,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.3501,8.106


In [0]:
for i in ['Weekly_Sales','MarkDown2','MarkDown3']:
    indexNames = dataset[dataset[i] < 0 ].index
    dataset.drop(indexNames , inplace=True, axis=0)

In [0]:
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%Y/%m/%d')
dataset['Date'] = dataset['Date'].dt.date
first_date = (np.min(dataset['Date']))
last_date = (np.max(dataset['Date']))
print('first date:',first_date,'\nlast date:',last_date)

first date: 2010-02-05 
last date: 2012-10-26


In [0]:
one_year = last_date - pd.Timedelta(54, 'w')
dataset = dataset.loc[dataset['Date'] > one_year]

In [0]:
dataset_sales_series = dataset.pivot_table(index =['Store', 'Dept'], columns='Date', values='Weekly_Sales', fill_value=0).reset_index()

In [0]:
dataset_sales_series['Number of missing values'] = (dataset_sales_series == 0).astype(int).sum(axis=1)

In [0]:
dataset_sales_series = dataset_sales_series.loc[dataset_sales_series['Number of missing values'] <= 5]
dataset_sales_series = dataset_sales_series.drop(columns='Number of missing values')
dataset_sales_series.head()

Date,Store,Dept,2011-10-21,2011-10-28,2011-11-04,2011-11-11,2011-11-18,2011-11-25,...,2012-09-07,2012-09-14,2012-09-21,2012-09-28,2012-10-05,2012-10-12,2012-10-19,2012-10-26
0,1,1,23351.8,31579.9,39886.06,18689.54,19050.66,20911.25,...,18322.37,19616.22,19251.5,18947.81,21904.47,22764.01,24185.27,27390.81
1,1,2,42487.67,41682.4,47313.62,44936.47,43997.78,44259.59,...,47344.5,44493.61,43541.07,45784.76,48577.08,42112.67,42354.72,43134.88
2,1,3,10136.78,9421.2,9189.2,9959.64,10104.36,9317.56,...,18368.51,14288.22,13403.63,13085.95,11676.98,10487.17,8548.87,9350.9
3,1,4,33954.55,34247.26,39354.84,36826.52,37656.58,46564.14,...,39549.27,35044.06,34507.34,34647.33,39311.93,35446.18,35549.19,36292.6
4,1,5,32700.34,26391.79,31956.07,31002.65,27339.37,79340.15,...,18236.15,19369.52,25798.78,22560.39,25508.81,20920.03,20413.83,25846.94


In [0]:
dataset_series_total = dataset_sales_series.merge(stores, how='left')

cat_col = ['Type']
for col in cat_col:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(dataset_series_total[col].values.astype('str'))
    dataset_series_total[col] = encoder.transform(dataset_series_total[col].values.astype('str'))
    
dataset_series_total.head()

Unnamed: 0,Store,Dept,2011-10-21,2011-10-28,2011-11-04,2011-11-11,2011-11-18,2011-11-25,...,2012-09-21,2012-09-28,2012-10-05,2012-10-12,2012-10-19,2012-10-26,Type,Size
0,1,1,23351.8,31579.9,39886.06,18689.54,19050.66,20911.25,...,19251.5,18947.81,21904.47,22764.01,24185.27,27390.81,0,151315
1,1,2,42487.67,41682.4,47313.62,44936.47,43997.78,44259.59,...,43541.07,45784.76,48577.08,42112.67,42354.72,43134.88,0,151315
2,1,3,10136.78,9421.2,9189.2,9959.64,10104.36,9317.56,...,13403.63,13085.95,11676.98,10487.17,8548.87,9350.9,0,151315
3,1,4,33954.55,34247.26,39354.84,36826.52,37656.58,46564.14,...,34507.34,34647.33,39311.93,35446.18,35549.19,36292.6,0,151315
4,1,5,32700.34,26391.79,31956.07,31002.65,27339.37,79340.15,...,25798.78,22560.39,25508.81,20920.03,20413.83,25846.94,0,151315


In [0]:
X_df = dataset_series_total.drop(columns=['Store', dt.date(2012,10,26)])
Y_df = dataset_series_total[[dt.date(2012,10,26)]]

train_df, valid_df, Y_train_df, Y_valid_df = train_test_split(X_df,Y_df, test_size=0.1, random_state=0)

X_train_df = train_df.drop(columns=['Dept','Type','Size'])
X_valid_df = valid_df.drop(columns=['Dept','Type','Size'])

print('X_train_df:','\n','\n', X_train_df.head(),)

X_train_df: 
 
       2011-10-21  2011-10-28  2011-11-04  2011-11-11  2011-11-18  2011-11-25  2011-12-02  2011-12-09  ...  2012-08-31  2012-09-07  2012-09-14  2012-09-21  2012-09-28  2012-10-05  2012-10-12  2012-10-19
1244  22189.7400  21685.9100  23132.2100  23049.4200  22196.5900  18596.7400  20159.7800  23034.8200  ...  21214.5100  23350.2800  21684.7200  21343.9700  20359.4600  22551.9600  21909.9100  20174.7500
768     243.0600    185.8800    279.1100    416.3800    413.7100    274.7200    366.3400    468.2900  ...    349.7800    221.9100      0.0000     71.0000     71.0600    246.5500    101.7500    141.2100
1010   8455.8500   7579.0600   8484.0300  10362.8300   8184.8100  12812.3100   9020.5000  11484.5800  ...   7874.9800      0.0000   7266.0900   7484.7200   7002.9200   8113.7700   8680.7700   9075.7300
2686   3723.3000   3111.5100   2832.6500   3536.0700   3909.7600   2765.7100   3842.4100   4227.5000  ...   3429.0000   4604.0900   4368.1800   3929.6800   3566.5100   4096.340

In [0]:
scaler = MinMaxScaler()
X_train_np = scaler.fit_transform(np.array(X_train_df))
X_valid_np = scaler.transform(np.array(X_valid_df))

In [0]:
X_train_lstm = X_train_np.reshape((X_train_np.shape[0], X_train_np.shape[1],1))
X_valid_lstm = X_valid_np.reshape((X_valid_np.shape[0], X_valid_np.shape[1],1))

print(X_train_lstm.shape)
print(X_valid_lstm.shape)

(2525, 53, 1)
(281, 53, 1)


In [0]:
serie_size =  X_train_lstm.shape[1]
n_features =  X_train_lstm.shape[2] 

epochs = 5
batch = 128
lr = 0.0001

In [0]:
encoder_decoder = Sequential()
encoder_decoder.add(LSTM(serie_size, activation='relu', input_shape=(serie_size, n_features), return_sequences=True))
encoder_decoder.add(LSTM(10, activation='relu', return_sequences=True))
encoder_decoder.add(LSTM(1, activation='relu'))
encoder_decoder.add(RepeatVector(serie_size))
encoder_decoder.add(LSTM(serie_size, activation='relu', return_sequences=True))
encoder_decoder.add(LSTM(10, activation='relu', return_sequences=True))
encoder_decoder.add(TimeDistributed(Dense(1)))
encoder_decoder.summary()

adam = optimizers.Adam(lr)
encoder_decoder.compile(loss='mse', optimizer=adam)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 53, 53)            11660     
_________________________________________________________________
lstm_12 (LSTM)               (None, 53, 10)            2560      
_________________________________________________________________
lstm_13 (LSTM)               (None, 1)                 48        
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 53, 1)             0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 53, 53)            11660     
_________________________________________________________________
lstm_15 (LSTM)               (None, 53, 10)            2560      
_________________________________________________________________
time_distributed_3 (TimeDist (None, 53, 1)             11        
Total para

In [0]:
encoder_decoder_history = encoder_decoder.fit(X_train_lstm, X_train_lstm, epochs=epochs, batch_size=batch, verbose=2)

Epoch 1/5
 - 11s - loss: 0.0229
Epoch 2/5
 - 8s - loss: 0.0216
Epoch 3/5
 - 8s - loss: 0.0203
Epoch 4/5
 - 8s - loss: 0.0189
Epoch 5/5
 - 8s - loss: 0.0176


In [0]:
encoder = Model(inputs=encoder_decoder.inputs, outputs=encoder_decoder.layers[2].output)
plot_model(encoder_decoder, show_shapes=True, to_file='/content/gdrive/My Drive/Colab Notebooks/Business_Analytics/encoder_decoder_reconstruct_lstm.png')
plot_model(encoder, show_shapes=True, to_file='/content/gdrive/My Drive/Colab Notebooks/Business_Analytics/encoder_lstm.png')

In [0]:
X_train_encoded = encoder.predict(X_train_lstm)
X_valid_encoded = encoder.predict(X_valid_lstm)
print('Encoded time-series shape', X_train_encoded.shape)

# print('Encoded time-series sample:')
# for i in range(10): 
#   print(X_train_encoded[i])

Encoded time-series shape (2525, 1)


In [0]:
train_df['Encoded'] = X_train_encoded
train_df['Label'] = Y_train_df

valid_df['Encoded'] = X_valid_encoded
valid_df['Label'] = Y_valid_df

X_train_basic = train_df[[dt.date(2012,10,19), 'Encoded','Dept', 'Type', 'Size']]
X_valid_basic = valid_df[[dt.date(2012,10,19), 'Encoded','Dept', 'Type', 'Size']]

Y_train_basic = train_df['Label']
Y_valid_basic = valid_df['Label']

X_train_basic.describe()

Unnamed: 0,2012-10-19,Encoded,Dept,Type,Size
count,2525.0,2525.0,2525.0,2525.0,2525.0
mean,15836.4286,0.0312,44.4107,0.583,137636.585
std,21434.9465,0.0083,30.8884,0.6631,60570.7549
min,0.0,0.0229,1.0,0.0,34875.0
25%,2436.51,0.0247,17.0,0.0,93638.0
50%,7936.52,0.0285,37.0,0.0,140167.0
75%,19936.24,0.0354,74.0,1.0,202505.0
max,177406.75,0.0782,98.0,2.0,219622.0


Create Numpy-Array

In [0]:
X_train_basic_np = np.array(X_train_basic)
X_valid_basic_np = np.array(X_valid_basic)

Y_train_basic_np = np.array(Y_train_basic)
Y_valid_basic_np = np.array(Y_valid_basic)

In [0]:
RFR_basic = RandomForestRegressor(n_estimators=100,max_features=2, verbose=0, n_jobs=-1)

In [0]:
RFR_basic.fit(X_train_basic_np, Y_train_basic_np)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=2, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [0]:
Y_predict = RFR_basic.predict(X_valid_basic_np)
print("Mean absolute Error:", np.round(mean_absolute_error(Y_valid_basic_np,Y_predict),2),'\n')

feat_importance = pd.DataFrame(RFR_basic.feature_importances_.reshape(1,5), columns=['Current Sales','Past Sales','Dept', 'Type', 'Size'])

print('Feature Importance:','\n','\n',feat_importance)

Mean absolute Error: 1659.11 

Feature Importance: 
 
    Current Sales  Past Sales   Dept   Type   Size
0         0.4911      0.4349 0.0505 0.0020 0.0216


In [0]:
X_train_baseline = train_df[[dt.date(2012,10,19)]]
Y_train_baseline = train_df['Label']

X_valid_baseline = valid_df[[dt.date(2012,10,19)]]
Y_valid_baseline = valid_df['Label']

X_train_baseline_np = scaler.fit_transform(np.array(X_train_baseline))
X_valid_baseline_np = scaler.transform(np.array(X_valid_baseline))

Y_train_baseline_np = np.array(Y_train_baseline)
Y_valid_baseline_np = np.array(Y_valid_baseline)

In [0]:
RFR_basic = RandomForestRegressor(n_estimators=100,max_features=1, verbose=0, n_jobs=-1)

RFR_basic.fit(X_train_baseline_np, Y_train_baseline_np)

Y_predict = RFR_basic.predict(X_valid_baseline_np)

print("Mean absolute Error:", np.round(mean_absolute_error(Y_valid_baseline_np,Y_predict),2),'\n')

Mean absolute Error: 2079.82 

