# MuchLearningSuchWow - LSTM - Preprocessing

This notebook contains the code we used to preprocess the data. The preprocessing code is based primarily on [this kernel](https://www.kaggle.com/bountyhunters/baseline-lstm-with-keras-0-7).

### Imports & Data Paths

In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
# Directory that holds the raw competition files such as calendar.csv (note the trailing slash)
inputPath = "input/m5-forecasting-accuracy/"
# Directory that the pickled intermediate and preprocessed files are read from and written to (note the trailing slash)
outputPath = "output/"

### Constants

In [None]:
# Number of days at the start of the data that are dropped and therefore ignored during training
startDay = 1000

# Calendar features: each flag decides whether that feature is appended to the additional features
add_oneDayBeforeEvent = True  # indicator for the day directly before a day with an event
add_weekend = True            # indicator for Saturdays and Sundays
add_weekDay = False           # the calendar's "wday" column
add_snapDays = False          # the calendar's snap_CA / snap_TX / snap_WI columns
add_months = True             # one indicator column per month

# One-hot encoding of item category and state (stored separately as the item data)
add_categoricalOneHot = False

# Rolling means of the sales columns, appended as extra columns of the training data
add_rollingMeans = False

### Loading Data

In [None]:
# Load the pickled sales table from the output directory and the calendar from the input directory.
# NOTE: pickle.load can execute arbitrary code, so only load a pickle file that you produced yourself.
# NOTE(review): both paths end up with a double slash because inputPath/outputPath already end
# in "/"; presumably harmless on common platforms, but confirm.
with open(outputPath + "/downcasted_sales_train_evaluation.pkl", "rb") as f:
    df_sales = pickle.load(f)
df_calendar = pd.read_csv(inputPath + "/calendar.csv")

In [None]:
# Preview the first rows of the loaded sales table
df_sales.head()

### Preprocessing

In [None]:
# Flip the sales table so that every row is a day and every column is an item
df_sales = df_sales.transpose()
print(df_sales.shape)
# Show enough rows to see the metadata rows followed by the first days
df_sales.head(n=11)

In [None]:
# The first six rows are id, item_id, dept_id, cat_id, store_id and state_id;
# keep them aside for possible future use
item_data = df_sales.iloc[:6]
# Drop those six rows together with the first "startDay" days
df_sales = df_sales.iloc[6 + startDay:]
print(df_sales.shape)
df_sales.head()

### Additional Features 

#### Calendar Features

##### One day before event

In [None]:
# Single all-zero float column with one row for each of the 1969 calendar days
days_before_event = pd.DataFrame(0.0, index=range(1969), columns=range(1))

In [None]:
# Assign "1" to every day that directly precedes a day with an event_name_1
# (event_name_2 never occurs without event_name_1, so it is redundant).
# The rows are written with a single .loc call: chained indexing such as
# frame[column][row] = value is not guaranteed to modify the frame and does
# nothing under pandas copy-on-write.
event_rows = df_calendar.index[df_calendar["event_name_1"].notna().to_numpy()]
event_rows = event_rows[event_rows != 0]  # the first day has no preceding day to mark
days_before_event.loc[event_rows - 1, 0] = 1

##### Weekend

In [None]:
# Single all-zero float column with one row for each of the 1969 calendar days
weekend = pd.DataFrame(0.0, index=range(1969), columns=range(1))

In [None]:
# Assign "1" to Saturdays and Sundays.
# The rows are written with a single .loc call: chained indexing such as
# frame[column][row] = value is not guaranteed to modify the frame and does
# nothing under pandas copy-on-write.
is_weekend = df_calendar["weekday"].isin(["Saturday", "Sunday"]).to_numpy()
weekend.loc[df_calendar.index[is_weekend], 0] = 1

##### Day of the week

In [None]:
# Keep the calendar's "wday" column as a one-column frame
week_day = df_calendar.loc[:, ["wday"]]

##### Snap Days

In [None]:
# Keep the three snap columns of the calendar as a three-column frame
snap_days = df_calendar.loc[:, ["snap_CA", "snap_TX", "snap_WI"]]

##### Months

In [None]:
# All-zero float table: one row for each of the 1969 calendar days, one column per month
months = pd.DataFrame(0.0, index=range(1969), columns=range(12))

In [None]:
# Assign "1" in the correct column for each day: the calendar's "month" value m
# is written to column m - 1.
# Each column is filled with a single .loc call: chained indexing such as
# frame[column][row] = value is not guaranteed to modify the frame and does
# nothing under pandas copy-on-write.
month_column = df_calendar["month"].to_numpy() - 1
for column in months.columns:
    months.loc[df_calendar.index[month_column == column], column] = 1

##### Combining Calendar Features 

In [None]:
# Collect every enabled calendar feature together with its column name(s).
# The order of the appends fixes the column order of the combined table, so
# features_to_add and feature_columns must always be extended together.
features_to_add = []
feature_columns = []

if add_oneDayBeforeEvent:
    features_to_add.append(days_before_event)
    feature_columns.append("one_day_before_event")
if add_weekend:
    features_to_add.append(weekend)
    feature_columns.append("weekend")
if add_weekDay:
    features_to_add.append(week_day)
    feature_columns.append("week_day")
if add_snapDays:
    features_to_add.append(snap_days)
    feature_columns.extend(["snap_CA", "snap_TX", "snap_WI"])
if add_months:
    features_to_add.append(months)
    feature_columns.extend(["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"])

if features_to_add:
    # Place the enabled features side by side (one row per calendar day)
    additional_features = pd.DataFrame(np.concatenate(features_to_add, axis = 1))
else:
    # np.concatenate raises a ValueError on an empty list, so with every flag
    # switched off fall back to a table with one row per calendar day and no columns
    additional_features = pd.DataFrame(np.zeros((len(df_calendar), 0)))
print(additional_features.shape)

In [None]:
# Drop the calendar frame; none of the cells shown below reads it again
del df_calendar

##### Splitting and Merging

In [None]:
# Cut the feature table into row ranges by day position:
#   startDay..1940 -> training + validation part (matches the retained sales rows)
#   1913..1940     -> validation part only
#   1941..1968     -> evaluation part
additional_features_train_valid = additional_features.iloc[startDay:1941]
additional_features_valid = additional_features.iloc[1913:1941]
additional_features_eval = additional_features.iloc[1941:1969]
# The full table is no longer needed once the parts exist
del additional_features

In [None]:
# Give the train + validation features the same row labels as the sales data
# and readable column names, so that both tables can be joined side by side
additional_features_train_valid.index = df_sales.index
additional_features_train_valid.columns = feature_columns
additional_features_train_valid.head(n=10)

In [None]:
# Training & validation set: the sales columns followed by the additional feature columns
df_train_valid = pd.concat(objs=[df_sales, additional_features_train_valid], axis="columns")
df_train_valid.columns

#### Rolling Means

In [None]:
if add_rollingMeans:
    # Rolling mean over a window of 7 rows for every sales column; missing
    # values (such as the incomplete windows at the start) are replaced by 0
    rolling_mean = pd.DataFrame(df_sales.rolling(window=7).mean()).fillna(0)

    # Name the rolling-mean columns rm0 ... rm30489
    rm_column_names = ["rm" + str(i) for i in range(30490)]
    rolling_mean.columns = rm_column_names

    # Append the rolling means to the right of the training & validation set
    df_train_valid = pd.concat([df_train_valid, rolling_mean], axis="columns")
    print(df_train_valid.columns)

### One-Hot Encoding of Categorical Item Data

In [None]:
if add_categoricalOneHot:
    # Create a one-hot encoding of category and state for each item column,
    # with 0s for the additional data columns and the same one-hot encoding for the rolling means.
    # Resulting layout: one row per distinct category/state value, one column per data column.
    item_data = item_data.iloc[[3,5]]  # rows 3 and 5 of the saved metadata rows (cat_id and state_id)
    categories = item_data.iloc[0]
    states = item_data.iloc[1]
    unique_categories_states = np.unique(categories).tolist() + np.unique(states).tolist()
    n_codes = len(unique_categories_states)
    n_items = item_data.shape[1]

    # Row = position of the value in unique_categories_states, column = item.
    # The matrix is filled through numpy indexing: addressing the frame as
    # frame[code][item] would select column "code" and row "item", which is the
    # transpose of this layout, and chained assignment is not guaranteed to write
    # to the frame at all.
    item_positions = np.arange(n_items)
    one_hot = np.zeros((n_codes, n_items))
    one_hot[[unique_categories_states.index(value) for value in categories], item_positions] = 1
    one_hot[[unique_categories_states.index(value) for value in states], item_positions] = 1
    item_data_one_hot = pd.DataFrame(one_hot)

    # Pad with zero columns for the additional features
    item_data = pd.concat([item_data_one_hot, pd.DataFrame(np.zeros((n_codes, additional_features_train_valid.shape[1])))], axis = 1)
    if add_rollingMeans:
        # The rolling-mean columns belong to the same items, so they reuse the encoding
        item_data = pd.concat([item_data, item_data_one_hot], axis = 1)
else:
    # Set item data to an empty DataFrame with the correct width
    item_data = pd.DataFrame(np.zeros((0,30490+additional_features_train_valid.shape[1])))
    if add_rollingMeans:
        item_data = pd.concat([item_data, pd.DataFrame(np.zeros((0,30490)))], axis = 1)
print(item_data.shape)

### Saving Results

In [None]:
# Save the preprocessed item data (the one-hot item encoding, or an empty
# frame of matching width when the one-hot encoding is disabled)
with open(outputPath + "/item_data.pkl", "wb") as f:
    pickle.dump(item_data, f)

In [None]:
# Save the validation and evaluation parts of the additional features
# as one pickled tuple in the order (validation, evaluation)
with open(outputPath + "/additional_features_testing.pkl", "wb") as f:
    pickle.dump((additional_features_valid, additional_features_eval), f)

In [None]:
# Save the preprocessed (train + validation) data: the sales columns followed by
# the additional feature columns and, when enabled, the rolling-mean columns
with open(outputPath + "/preprocessed_train_valid_data.pkl", "wb") as f:
    pickle.dump(df_train_valid, f)