In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import boxcox
from plotnine import *
from plotnine import ggplot, aes, geom_line
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from tqdm.notebook import tqdm

In [2]:
# Load Data
# Read the five AFCS 2020 CSV inputs in one pass; the unpacking order below
# matches the order of the file paths.
calendar, sales, train, sample_submission, price = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/calendar_afcs2020.csv",
        "Data/sales_train_evaluation_afcs2020.csv",
        "Data/sales_train_validation_afcs2020.csv",
        "Data/sample_submission_afcs2020.csv",
        "Data/sell_prices_afcs2020.csv",
    )
]

In [3]:
# timesteps: number of consecutive past days in each LSTM input window.
# startDay:  number of leading training days to discard (0 keeps all of them).
timesteps, startDay = 14, 0

In [4]:
# Summarise the freshly loaded training frame (row/column counts, dtypes, memory use).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Columns: 1914 entries, id to d_1913
dtypes: int64(1913), object(1)
memory usage: 2.2+ MB


In [5]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` to save memory.

    ``float64`` columns are converted to ``float32``. ``int64``/``int32``
    columns are converted to ``int16`` only when every value fits in the
    int16 range; columns that do not fit keep their dtype, because a blind
    ``astype(np.int16)`` would silently wrap out-of-range values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)

    if int_cols:
        int16_range = np.iinfo(np.int16)
        col_min = df[int_cols].min()
        col_max = df[int_cols].max()
        # Written as "not out of range" so that an empty frame (NaN min/max)
        # is still downcast, as it was before.
        fits = ~((col_min < int16_range.min) | (col_max > int16_range.max))
        safe_int_cols = fits.index[fits.to_numpy()].tolist()
        if safe_int_cols:
            df[safe_int_cols] = df[safe_int_cols].astype(np.int16)

    return df

In [6]:
# downcast_dtypes modifies `train` in place and returns it, so `dt` and `train`
# refer to the same DataFrame object from here on.
dt = downcast_dtypes(train)

In [7]:
# Show dtypes and memory use again, now that the numeric columns have been downcast.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Columns: 1914 entries, id to d_1913
dtypes: int16(1913), object(1)
memory usage: 558.0+ KB


In [8]:
# Transpose so that each row is a day (d_1, d_2, ...) and each column is one item series.
# Built from `train` (the same object `dt` was bound to by downcast_dtypes) rather
# than from `dt` itself, so re-running this cell does not transpose the frame back.
dt = train.T
dt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,139,140,141,142,143,144,145,146,147,148
id,HOBBIES_2_001_CA_3_validation,HOBBIES_2_002_CA_3_validation,HOBBIES_2_003_CA_3_validation,HOBBIES_2_004_CA_3_validation,HOBBIES_2_005_CA_3_validation,HOBBIES_2_006_CA_3_validation,HOBBIES_2_007_CA_3_validation,HOBBIES_2_008_CA_3_validation,HOBBIES_2_009_CA_3_validation,HOBBIES_2_010_CA_3_validation,...,HOBBIES_2_140_CA_3_validation,HOBBIES_2_141_CA_3_validation,HOBBIES_2_142_CA_3_validation,HOBBIES_2_143_CA_3_validation,HOBBIES_2_144_CA_3_validation,HOBBIES_2_145_CA_3_validation,HOBBIES_2_146_CA_3_validation,HOBBIES_2_147_CA_3_validation,HOBBIES_2_148_CA_3_validation,HOBBIES_2_149_CA_3_validation
d_1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
d_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
d_3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d_4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [9]:
# Drop the leading `id` row of the transposed frame, plus the first `startDay` days.
# Recomputed from `train` so the cell is idempotent: slicing `dt` by itself would
# drop another row on every re-run.
dt = train.T.iloc[1 + startDay:]
dt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,139,140,141,142,143,144,145,146,147,148
d_1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
d_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
d_3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d_4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
d_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Single-column frame of zeros, one row per day; rows are set to 1 later for
# days that precede an event.
# NOTE(review): 1969 is presumably the number of rows in `calendar` - confirm.
n_flag_rows = 1969
daysBeforeEvent = pd.DataFrame(data = np.zeros(shape = (n_flag_rows, 1)))

In [11]:
# Flag the row *before* every calendar row that has an `event_name_1` entry.
# Done as one positional `.iloc` write instead of the chained assignment
# `daysBeforeEvent[0][x-1] = 1`, which does not update the frame under pandas
# copy-on-write. `calendar` comes from read_csv with its default RangeIndex, so
# row positions equal the labels the original loop used.
event_positions = np.flatnonzero(calendar["event_name_1"].notna().to_numpy())
day_before_positions = event_positions - 1
# An event on the very first row has no previous day (-1 would address the wrong
# row), and positions past the end of the flag frame cannot be stored.
in_range = (day_before_positions >= 0) & (day_before_positions < len(daysBeforeEvent))
daysBeforeEvent.iloc[day_before_positions[in_range], 0] = 1

In [12]:
# Release the calendar frame; no later cell reads it.
# NOTE: after this, cells that read `calendar` cannot be re-run without reloading the data.
del calendar

In [13]:
# Split the flag rows by position: rows 1913..1940 are used for the forecast
# window, rows startDay..1912 line up with the training days.
daysBeforeEventTest = daysBeforeEvent.iloc[1913:1941]
daysBeforeEvent = daysBeforeEvent.iloc[startDay:1913]

In [14]:
# Give the training flags the same row labels as `dt` so the two frames can be
# concatenated column-wise, and name the single flag column.
daysBeforeEvent.index = dt.index
daysBeforeEvent.columns = ["oneDayBeforeEvent"]

In [15]:
# Append the event flag as an extra feature column next to the item series.
dt = pd.concat(objs = [dt, daysBeforeEvent], axis = "columns")
dt.columns

Index([                  0,                   1,                   2,
                         3,                   4,                   5,
                         6,                   7,                   8,
                         9,
       ...
                       140,                 141,                 142,
                       143,                 144,                 145,
                       146,                 147,                 148,
       'oneDayBeforeEvent'],
      dtype='object', length=150)

In [16]:
#Feature Scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
# `dt` has integer column labels for the item series and a string label for the
# event flag. scikit-learn rejects DataFrames with mixed int/str column labels
# (TypeError since 1.2), so hand it a plain float array; the scaled values are
# the same.
dt_scaled = sc.fit_transform(dt.to_numpy(dtype = np.float64))

In [17]:
# Sliding windows over the scaled rows: each sample is `timesteps` consecutive
# rows (all columns), and its target is the first 149 columns of the row that
# follows the window (the trailing event-flag column is not predicted).
window_ends = range(timesteps, 1913 - startDay)
X_train = [dt_scaled[end - timesteps:end] for end in window_ends]
y_train = [dt_scaled[end][0:149] for end in window_ends]

In [18]:
# The training windows have been built from it, so drop the scaled array.
del dt_scaled

In [19]:
#Convert to np array to be able to feed the LSTM model
X_train, y_train = np.array(X_train), np.array(y_train)
# One shape per line: (samples, timesteps, features) then (samples, targets).
print(X_train.shape, y_train.shape, sep = "\n")

(1899, 14, 150)
(1899, 149)


In [20]:
# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import tensorflow as tf


In [22]:
## Initialising the RNN
# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable across runs.
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

regressor = Sequential()

# Adding the first LSTM layer and some Dropout regularisation
# input_shape is (timesteps, features), taken from the training windows.
layer_1_units=50
regressor.add(LSTM(units = layer_1_units, return_sequences = True, input_shape = (X_train.shape[1], X_train.shape[2])))
regressor.add(Dropout(0.2))

# Adding a second LSTM layer and some Dropout regularisation
layer_2_units=400
regressor.add(LSTM(units = layer_2_units, return_sequences = True))
regressor.add(Dropout(0.2))

# Adding a third LSTM layer and some Dropout regularisation
# No return_sequences here: only the final state feeds the dense output layer.
layer_3_units=400
regressor.add(LSTM(units = layer_3_units))
regressor.add(Dropout(0.2))

# Adding the output layer
# One output per target column; derived from y_train instead of a hard-coded 149.
regressor.add(Dense(units = y_train.shape[1]))

# Compiling the RNN
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Fitting the RNN to the Training set
epoch_no=200
batch_size_RNN=16
regressor.fit(X_train, y_train, epochs = epoch_no, batch_size = batch_size_RNN)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x27cf561ae20>

In [23]:
# Seed window for forecasting: the last `timesteps` rows of the training frame,
# scaled with the already-fitted scaler. Passed as a plain float array because
# `dt` has mixed int/str column labels, which scikit-learn rejects for
# DataFrame input (TypeError since 1.2).
inputs = dt.iloc[-timesteps:].to_numpy(dtype = np.float64)
inputs = sc.transform(inputs)

In [24]:
# Recursive multi-step forecast: predict one day from the latest `timesteps`
# rows, append that prediction (plus the known event flag for the day) to the
# history, and repeat for every row of `daysBeforeEventTest`.
n_features = inputs.shape[1]                  # item series + trailing event-flag column
n_targets = n_features - 1                    # the model predicts the item series only
forecast_horizon = len(daysBeforeEventTest)   # one forecast step per test-window day

X_test = np.array([inputs[0:timesteps]])      # shape (1, timesteps, n_features)
predictions = []

for step in range(forecast_horizon):
    window = X_test[0, step:step + timesteps].reshape(1, timesteps, n_features)
    scaled_forecast = regressor.predict(window)
    # Positional lookup: row `step` of the test flags is the flag for this day.
    event_flag = daysBeforeEventTest.iloc[step, 0]
    next_row_scaled = np.column_stack((np.array(scaled_forecast), event_flag))
    # Grow the history along the time axis so later windows include this forecast.
    X_test = np.concatenate((X_test, next_row_scaled[np.newaxis, ...]), axis = 1)
    # Undo the scaling on the full row, then keep only the item columns.
    forecast_units = sc.inverse_transform(next_row_scaled)[:, 0:n_targets]
    predictions.append(forecast_units)

In [25]:
import os

# Assemble the submission: one row per series id, one column per forecast day,
# using the column names of the sample submission.
idColumn = sample_submission[["id"]]
# predictions is a list of 28 arrays of shape (1, 149); stack to (28, 149) and
# transpose to (149, 28) so series run down the rows.
output_df = pd.DataFrame(data=np.array(predictions).reshape(28,149)).T
# NOTE(review): concat aligns on the row index - assumes sample_submission rows
# are in the same order as the training series; confirm.
output_df = pd.concat([idColumn, output_df], axis=1)
output_df.columns = sample_submission.columns

# Create the target directory first so the write cannot fail after the long
# training run just because "Output/" does not exist yet.
os.makedirs("Output", exist_ok=True)
output_df.to_csv("Output/output_basic_LSTM_7.csv", index=False)