In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Directory that holds the CSV files read by this notebook.
dataPath = "./data"
# Number of consecutive past days that make up one model input window.
timesteps = 14
# Number of leading day rows dropped from the history before training.
startDay = 350

In [3]:
# Load the sales table. The preview below shows six identifier columns
# (id, item_id, dept_id, cat_id, store_id, state_id) followed by one
# column per day (d_1 ... d_1941).
dt = pd.read_csv(dataPath + "/sales_train_evaluation.csv")
dt.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1


In [7]:
# Load the calendar table. Each row is one day (column `d`), with optional
# event names/types and per-state SNAP indicator columns.
calendar = pd.read_csv(dataPath + "/calendar.csv")
calendar.head(3)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0


---
## 降低記憶體空間

將數值欄位轉換為較低精度的型別：float64 → float32 (單精度)、int64/int32 → int16 (遺失些許準確率，換來效率的提升)

In [4]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` to narrower dtypes to save memory.

    ``float64`` columns are cast to ``float32``. ``int64``/``int32`` columns
    are cast to ``int16`` only when every value in the column fits in the
    ``int16`` range; columns holding larger values keep their original dtype
    instead of silently wrapping around.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on the object itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient ``df = downcast_dtypes(df)``.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)

    if int_cols:
        limits = np.iinfo(np.int16)
        col_min = df[int_cols].min()
        col_max = df[int_cols].max()
        # NaN extrema (empty frame) compare False, so such columns count as safe.
        overflow = (col_min < limits.min) | (col_max > limits.max)
        safe_cols = [c for c in int_cols if not overflow[c]]
        if safe_cols:
            df[safe_cols] = df[safe_cols].astype(np.int16)

    return df

In [5]:
# Shrink the numeric columns of the sales table before the transformations below.
dt = downcast_dtypes(dt)

---
## 資料前處理

僅使用數值資料，暫時移除其他不必要的資料

(單純用歷史資料建立 LSTM 模型)

In [8]:
# Put days on the rows and series on the columns, then drop the six
# identifier rows plus the first `startDay` day rows (positional slice).
n_id_rows = 6
dt_by_day = dt.T
dt = dt_by_day.iloc[n_id_rows + startDay:]
dt.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_351,0,0,0,2,0,0,0,24,3,2,...,0,9,1,0,11,0,0,1,0,0
d_352,0,0,0,0,0,0,0,9,0,2,...,0,5,4,0,8,0,1,2,0,0
d_353,0,0,0,4,2,0,0,2,1,1,...,0,15,2,0,3,0,1,2,0,0
d_354,0,1,0,2,0,0,0,7,1,0,...,0,5,1,0,3,0,0,0,0,0
d_355,0,0,0,1,2,0,0,0,0,0,...,0,7,1,0,1,0,1,1,0,0


---
## 建立特殊節日的Feature (Binary)

In [9]:
# One zero-initialised flag per row, 1969 rows in a single column labelled 0.
# NOTE(review): 1969 presumably equals the number of rows in calendar.csv — confirm.
daysBeforeEvent = pd.DataFrame(np.zeros((1969,1)))

In [10]:
# Flag every day whose *following* calendar row has an `event_name_1` entry.
# The write goes through a single `.loc` call: the previous
# `daysBeforeEvent[0][x-1] = 1` was chained assignment, which may update a
# temporary Series and has no effect when pandas copy-on-write is enabled.
event_days = calendar.index[calendar["event_name_1"].notna()]
# An event on the very first calendar row has no preceding day to flag.
days_before = event_days[event_days > 0] - 1
daysBeforeEvent.loc[days_before, 0] = 1

In [13]:
# Rows 1941..1968 are kept for the forecast loop; rows startDay..1940 line up
# with the trimmed history table.
history_end, calendar_end = 1941, 1969
daysBeforeEventTest = daysBeforeEvent.iloc[history_end:calendar_end]
daysBeforeEvent = daysBeforeEvent.iloc[startDay:history_end]

In [14]:
# Give the flag column a descriptive name and reuse the day labels of `dt`
# as its index so the two frames can be concatenated column-wise.
daysBeforeEvent.columns = ["oneDayBeforeEvent"]
daysBeforeEvent.index = dt.index

In [15]:
# Append the event flag as one extra column to the right of the sales series.
dt = pd.concat([dt, daysBeforeEvent], axis = 1)

---
## 資料標準化

In [18]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column to the [0, 1] range. The scaler is fitted on the whole
# `dt` table and reused later to scale the forecast inputs and invert outputs.
sc = MinMaxScaler(feature_range = (0, 1))
dt_scaled = sc.fit_transform(dt)

---
## 拆分資料

In [20]:
# Each sample is a window of `timesteps` consecutive scaled rows; its target
# is the first 30490 columns of the row right after that window.
n_history_rows = 1941 - startDay
window_ends = range(timesteps, n_history_rows)
X_train = [dt_scaled[end - timesteps:end] for end in window_ends]
y_train = [dt_scaled[end][0:30490] for end in window_ends]


In [21]:
# Turn the lists of windows/targets into arrays and report their shapes.
X_train, y_train = np.array(X_train), np.array(y_train)
for training_array in (X_train, y_train):
    print(training_array.shape)

(1577, 14, 30491)
(1577, 30490)


---
## 建構 LSTM Model

In [24]:
# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

# Initialising the RNN
regressor = Sequential()

# Adding the first LSTM layer and some Dropout regularisation.
# return_sequences=True passes the whole sequence on to the next LSTM layer.
layer_1_units = 50
regressor.add(LSTM(units = layer_1_units, return_sequences = True, input_shape = (X_train.shape[1], X_train.shape[2])))
regressor.add(Dropout(0.2))

# Adding the second LSTM layer and some Dropout regularisation.
# (Previously mislabelled "third" and configured through `layer_3_units`.)
layer_2_units = 400
regressor.add(LSTM(units = layer_2_units, return_sequences = True))
regressor.add(Dropout(0.2))

# Adding the third LSTM layer and some Dropout regularisation.
# It returns only its last output, which feeds the dense output layer.
layer_3_units = 400
regressor.add(LSTM(units = layer_3_units))
regressor.add(Dropout(0.2))

# Adding the output layer: one unit per target column, taken from `y_train`
# so the model always matches the width of the targets it is fitted on.
regressor.add(Dense(units = y_train.shape[1]))

# Compiling the RNN
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Fitting the RNN to the Training set
epoch_no = 32
batch_size_RNN = 44
regressor.fit(X_train, y_train, epochs = epoch_no, batch_size = batch_size_RNN)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x7ff9184d5dd8>

---
## 進行預測

In [25]:
# Scale the last `timesteps` rows of the history with the already-fitted scaler.
inputs = sc.transform(dt.iloc[-timesteps:])

In [26]:
n_features = 30491   # width of one scaled row: 30490 series + the event flag
n_series = 30490     # leading columns kept as the forecast
horizon = 28         # number of days predicted one after another

# Start from the scaled history window, shaped (1, timesteps, n_features).
X_test = np.array([inputs[0:timesteps]])
predictions = []

for j in range(timesteps, timesteps + horizon):
    # Feed the most recent `timesteps` rows to the model.
    window = X_test[0, j - timesteps:j].reshape(1, timesteps, n_features)
    scaled_forecast = regressor.predict(window)

    # Complete the predicted row with the event flag stored under label 1941 + step.
    event_flag = daysBeforeEventTest[0][1941 + j - timesteps]
    testInput = np.column_stack((np.array(scaled_forecast), event_flag))

    # Append the new row so the next iteration's window can include it.
    X_test = np.concatenate((X_test, testInput[np.newaxis, :, :]), axis=1)

    # Store the forecast in original units, without the event flag column.
    predictions.append(sc.inverse_transform(testInput)[:, 0:n_series])

---
## 輸出預測檔案 (評分用)

In [27]:
import time

# One row per series, one column per forecast day, labelled F1 ... F28.
forecast_cols = [f"F{i}" for i in range (1,29)]
forecast = pd.DataFrame(data=np.array(predictions).reshape(28,30490)).T
forecast.columns = forecast_cols

# Stack the same forecasts twice so the row count doubles before ids are attached.
submission = pd.concat((forecast, forecast), ignore_index=True)

# Take the ids from the sample submission (aligned on the row index) and
# place them in the first column.
sample_submission = pd.read_csv(dataPath + "/sample_submission.csv")
submission.insert(0, "id", sample_submission["id"])

# NOTE(review): this timestamp is computed but not used by the code shown here.
currentDateTime = time.strftime("%d%m%Y_%H%M%S")

submission.to_csv("submission.csv", index=False)

In [28]:
# Preview the first rows of the frame that was written to submission.csv.
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1.24594,1.032498,0.932818,0.947702,1.094236,1.536372,1.83886,1.435075,1.140363,...,1.091665,1.303577,1.851533,1.724813,1.273838,1.120029,1.068224,1.080767,1.224417,1.731668
1,HOBBIES_1_002_CA_1_validation,0.259987,0.256329,0.236861,0.236116,0.256735,0.319781,0.326544,0.227599,0.23182,...,0.234194,0.275499,0.360238,0.325392,0.268318,0.257325,0.239781,0.239185,0.267871,0.351715
2,HOBBIES_1_003_CA_1_validation,0.75574,0.524774,0.538674,0.638328,0.810554,1.118717,1.367254,1.010991,0.596196,...,0.795225,0.983962,1.367719,1.299272,0.764088,0.63159,0.683569,0.773542,0.925239,1.270028
3,HOBBIES_1_004_CA_1_validation,2.152712,1.132408,0.988428,0.957552,1.133381,1.869931,3.213064,2.96411,1.208111,...,1.023053,1.348001,2.390964,3.398544,1.717434,1.11355,1.016242,1.014427,1.233648,2.021408
4,HOBBIES_1_005_CA_1_validation,1.514396,1.480529,1.419549,1.374193,1.333392,1.287191,1.393267,1.496253,1.44859,...,1.354166,1.340399,1.354916,1.515732,1.470831,1.456721,1.407388,1.373834,1.360833,1.36333
