In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Preprocessing the Rossmann Store Sales Dataset
Here we implement some feature engineering outlined by FastAI in [their example solution](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson3-rossman.ipynb) to the [Kaggle Rossmann Store Sales competition](https://www.kaggle.com/c/rossmann-store-sales). We've simplified some sections and left out most of the documentation to keep things neat, so feel free to consult the original notebook for explanations of the feature engineering going on.

In [2]:
import os
import pandas as pd
import numpy as np
import cudf
import nvtabular as nvt
import dask_cudf

import nvtabular as nvt
from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, Filter, HashBucket, FillMedian

In [3]:
# Where the raw Rossmann CSVs are read from and where the processed CSVs are
# written; each can be overridden with the environment variable of the same name.
INPUT_DATA_DIR = os.getenv('INPUT_DATA_DIR', './rossmann')
OUTPUT_DATA_DIR = os.getenv('OUTPUT_DATA_DIR', './data')

In [4]:
# ! mkdir -p $INPUT_DATA_DIR
# ! wget -O $INPUT_DATA_DIR/rossmann.tgz http://files.fast.ai/part2/lesson14/rossmann.tgz
# ! cd $INPUT_DATA_DIR && tar -xzf rossmann.tgz && ls

In [5]:
def read_table(table_name):
    """Read `<INPUT_DATA_DIR>/<table_name>.csv` with `cudf.read_csv` and return the result."""
    csv_path = os.path.join(INPUT_DATA_DIR, f'{table_name}.csv')
    return cudf.read_csv(csv_path)

# Load every raw table, in the same order as before, one CSV per name.
train, store, store_states, state_names, googletrend, weather, test = (
    read_table(table)
    for table in ('train', 'store', 'store_states', 'state_names', 'googletrend', 'weather', 'test')
)

In [6]:
# Collapse StateHoliday into a boolean flag: True for any value other than '0'.
for frame in (train, test):
    frame['StateHoliday'] = frame['StateHoliday'] != '0'

In [7]:
# `week` is split on ' - ' and the first piece is kept as the Date.
googletrend['Date'] = googletrend['week'].str.split(' - ', n=1, expand=True)[0]
# The third '_'-separated piece of `file` is used as the State code.
googletrend['State'] = googletrend['file'].str.split('_', n=2, expand=True)[2]
# Rows coded 'NI' are relabelled 'HB,NI'
# (presumably to match the code used by the store tables - TODO confirm).
googletrend['State'] = googletrend['State'].where(googletrend['State'] != 'NI', 'HB,NI')
# Separate copy of the rows whose `file` is 'Rossmann_DE'.
trend_de = googletrend.loc[googletrend['file'] == 'Rossmann_DE'].copy()

In [8]:
# Derive Month / Day / Week / Year columns from Date on every dated table.
for frame in (weather, googletrend, train, test, trend_de):
    frame['Date'] = frame['Date'].astype('datetime64[s]')
    frame['Month'] = frame['Date'].dt.month
    frame['Day'] = frame['Date'].dt.day
    # Year temporarily holds the 'YYYY-01-01' string so it can be parsed as
    # the first day of the year for the week computation below.
    frame['Year'] = frame['Date'].dt.year.astype(str) + '-01-01'
    days_into_year = (frame['Date'] - frame['Year'].astype('datetime64[s]')).dt.days
    # Week = 1 + whole weeks elapsed since Jan 1; a computed week 53 is folded into 52.
    frame['Week'] = (days_into_year / 7).astype('int16') + 1
    frame['Week'] = frame['Week'].where(frame['Week'] != 53, 52)
    # Finally replace the temporary string with the numeric year.
    frame['Year'] = frame['Date'].dt.year

In [9]:
def merge(df, right, left_on, right_on=None, suffix=None):
    """Left-join `right` onto `df` and return the merged frame.

    Parameters
    ----------
    df, right : DataFrame-like objects exposing `.merge`
    left_on : key column name(s) in `df`
    right_on : key column name(s) in `right`; falls back to `left_on` when
        omitted (or otherwise falsy)
    suffix : suffix appended to overlapping column names coming from `right`;
        falls back to '_y'. Overlapping columns from `df` keep their names.
    """
    right_keys = right_on if right_on else left_on
    right_suffix = suffix if suffix else '_y'
    return df.merge(
        right,
        how='left',
        left_on=left_on,
        right_on=right_keys,
        suffixes=('', right_suffix),
    )

In [10]:
def drop_cols(gdf):
    """Drop, in place, every column of `gdf` whose name ends with '_y'.

    Returns the same `gdf` object so the call can also be used in an assignment.
    """
    suffixed = [name for name in gdf.columns if name.endswith('_y')]
    for name in suffixed:
        # Skip names that an earlier drop in this loop already removed.
        if name not in gdf.columns:
            continue
        gdf.drop(name, inplace=True, axis=1)
    return gdf

In [11]:
# Join the state_names table onto weather (file == StateName) and the
# store_states table onto store, then left-join the store attributes onto
# the train and test rows.
weather = merge(weather, state_names, left_on='file', right_on='StateName')
store = merge(store, store_states, left_on='Store')
train_df = merge(train, store, left_on='Store')
test_df = merge(test, store, left_on='Store')

In [12]:
# Join the per-state Google Trends rows on (State, Year, Week).
trend_keys = ['State', 'Year', 'Week']
train_df = merge(train_df, googletrend, trend_keys)
test_df = merge(test_df, googletrend, trend_keys)
# Release the references to frames that are no longer needed.
train = None
test = None
googletrend = None

In [13]:
# Join the 'Rossmann_DE' trend rows on (Year, Week); overlapping columns
# coming from trend_de receive the '_DE' suffix. `merge` reuses the left
# keys on the right side when `right_on` is not given.
train_df = merge(train_df, trend_de, left_on=['Year', 'Week'], suffix='_DE')
test_df = merge(test_df, trend_de, left_on=['Year', 'Week'], suffix='_DE')

In [14]:
# Remove the '_y' duplicates left by the previous joins, then attach the
# weather rows on (State, Date). `merge` defaults to the same right-hand
# keys and to the '_y' suffix, so both calls can rely on the defaults.
train_df = drop_cols(train_df)
test_df = drop_cols(test_df)
train_df = merge(train_df, weather, ['State', 'Date'])
test_df = merge(test_df, weather, ['State', 'Date'])

In [15]:
# Drop the '_y' duplicate columns produced by the weather join.
train_df, test_df = drop_cols(train_df), drop_cols(test_df)

In [16]:
# Placeholder used for each "since" column when its value is missing.
since_defaults = {
    'CompetitionOpenSinceYear': 1900,
    'CompetitionOpenSinceMonth': 1,
    'Promo2SinceYear': 1900,
    'Promo2SinceWeek': 1,
}
for frame in (train_df, test_df):
    for column, placeholder in since_defaults.items():
        # Fill the gaps, then store the column as int32.
        frame[column] = frame[column].fillna(placeholder).astype(np.int32)

In [17]:
for frame in (train_df, test_df):
    # Helper columns in the year/month/day layout that to_datetime assembles;
    # the day is fixed to the 15th of the month.
    frame['year'] = frame['CompetitionOpenSinceYear']
    frame['month'] = frame['CompetitionOpenSinceMonth']
    frame['day'] = '15'
    date_parts = frame[["year", "month", "day"]]
    frame['CompetitionOpenSince'] = cudf.to_datetime(date_parts)
    # Days between the row's Date and the competitor's opening date.
    days_open = frame['Date'] - frame['CompetitionOpenSince']
    frame["CompetitionDaysOpen"] = days_open.dt.days

In [18]:
for frame in (train_df, test_df):
    # Zero out negative durations (opening date later than the row's Date).
    negative_days = frame['CompetitionDaysOpen'] < 0
    frame.loc[negative_days, "CompetitionDaysOpen"] = 0
    # Zero out rows whose opening year is before 1990, which includes the
    # 1900 placeholder used for missing years.
    before_1990 = frame['CompetitionOpenSinceYear'] < 1990
    frame.loc[before_1990, "CompetitionDaysOpen"] = 0

In [19]:
for frame in (train_df, test_df):
    # Months are approximated as 30-day blocks and capped at 24.
    frame["CompetitionMonthsOpen"] = frame["CompetitionDaysOpen"] // 30
    over_cap = frame['CompetitionMonthsOpen'] > 24
    frame.loc[over_cap, "CompetitionMonthsOpen"] = 24

In [20]:
# Build a Promo2Since timestamp from Promo2SinceYear / Promo2SinceWeek and
# measure how many days separate it from each row's Date.
for df in [train_df, test_df]:
    # Temporary 'YYYY-01-01' string column derived from the year.
    df['Promo2SinceYear_tmp']= df.Promo2SinceYear.astype(str)+ '-01-01'
    # Parse it and reduce the integer timestamp by 10**9
    # (presumably nanoseconds -> whole seconds; TODO confirm the parsed unit).
    dt = cudf.to_datetime(df.Promo2SinceYear_tmp, format='%Y').astype(np.int64) // 10**9
    # Advance by Promo2SinceWeek weeks, expressed in seconds (7 * 24 * 3600 per week).
    dt += 7*24*3600*df.Promo2SinceWeek
    # Scale back up by 10**9 and convert the integers to datetimes again.
    df["Promo2Since"] = cudf.to_datetime(dt*10**9)
    # Whole days between the row's Date and Promo2Since.
    df["Promo2Days"] = (df['Date']- df["Promo2Since"]).dt.days

In [21]:
for df in [train_df, test_df]:
    # Zero out negative durations and rows whose Promo2SinceYear is before
    # 1990, which includes the 1900 placeholder used for missing years.
    df.loc[df.Promo2Days<0, "Promo2Days"] = 0
    df.loc[df.Promo2SinceYear<1990, "Promo2Days"] = 0
    # Convert to whole weeks and clamp the result to the range [0, 25].
    df["Promo2Weeks"] = df["Promo2Days"]//7
    df.loc[df.Promo2Weeks<0, "Promo2Weeks"] = 0
    df.loc[df.Promo2Weeks>25, "Promo2Weeks"] = 25
    # The bare `df.Promo2Weeks.unique()` that used to end this loop was
    # removed: inside a loop its value is neither displayed nor stored.

In [22]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index.
# `cudf.concat` is used instead of `DataFrame.append`, which is deprecated
# and no longer available in newer pandas/cudf releases.
df = cudf.concat([train_df, test_df], ignore_index=True)

In [23]:
# Drop the helper columns that were only needed to build the date features.
# The result is assigned back without `inplace=True`: an in-place drop returns
# None, which would overwrite the three frames with None. `columns=` makes it
# explicit that column labels, not row labels, are being removed.
dummy_columns = ['year', 'month', 'day', 'Promo2SinceYear_tmp']
df = df.drop(columns=dummy_columns)
train_df = train_df.drop(columns=dummy_columns)
test_df = test_df.drop(columns=dummy_columns)

In [24]:
# # cast SchoolHoliday to int32.
# Store SchoolHoliday as int32.
df['SchoolHoliday'] = df.SchoolHoliday.astype('int32')

In [25]:
# ops: masking, ffill, bfill, timedelta
# Order the rows by store and then by date, and hand the frame over to pandas
# so that the `bfill` and `ffill` operations below can be used.
df = df.sort_values(by=['Store', 'Date']).to_pandas()

This is a modified version of the original code from https://github.com/fastai/course-v3/blob/master/nbs/dl1/rossman_data_clean.ipynb
It is still not working as intended with cudf, which is why the dataframe is converted to pandas for this step. The `get_elapsed` function is defined for cumulative counting across a sorted dataframe. Given a particular field `fld` to monitor, this function will start tracking time since the last occurrence of that field. When the field is seen again, the counter is set to zero.

Upon initialization, this will result in datetime na's until the field is encountered. This is reset every time a new store is seen. We'll see how to use this shortly.

In [26]:
# first build a mask indicating where stores start and end
# First build a mask marking the first and last row of every store.
# `df` is sorted by (Store, Date), so a non-zero difference in Store marks
# the first row of a store; shifting that difference up by one row marks the
# last row of the previous store.
store_step = df.Store.diff()
first_indices = store_step != 0
# The appended 1 flags the final row of the frame as a store end.
# `pd.concat` replaces `Series.append` (removed in pandas 2.0), and the explicit
# `!= 0` yields a boolean mask instead of relying on float-to-bool coercion in `|`.
last_indices = pd.concat([store_step.iloc[1:], pd.Series([1])]) != 0
last_indices.index = first_indices.index
idx_mask = ~(first_indices | last_indices)

one_day = np.timedelta64(1, 'D')
event_fields = ['SchoolHoliday', 'StateHoliday', 'Promo']
for field in event_fields:
    # Keep the date on rows where the event occurs and on the first/last row
    # of each store; blank it everywhere else.
    df['tmp'] = df.Date
    df.loc[(df[field] == 0) & idx_mask, 'tmp'] = np.nan
    # ffill gives the most recent kept date, bfill the next kept date.
    df['After'+field] = df.tmp.ffill()
    df['Before'+field] = df.tmp.bfill()

    # Express the gaps to the current date as a float number of days.
    # Dividing by a one-day timedelta replaces `astype('timedelta64[D]')`,
    # which pandas 2.x rejects; the dates are whole days, so the values match.
    df['After'+field] = (df['Date'] - df['After'+field]) / one_day
    df['Before'+field] = (df['Before'+field] - df['Date']) / one_day

# get rid of our dummy column
df = df.drop(columns=['tmp'])

# convert the pandas frame back to a cudf DataFrame
df = cudf.from_pandas(df)

In [27]:
# Index the rows by Date; the rolling windows computed next sort on this index.
df = df.set_index('Date')

In [28]:
# Per-store rolling sums of the event flags over windows of up to 7 rows
# (min_periods=1, so shorter windows at the start still produce a value).
event_fields = ['SchoolHoliday', 'StateHoliday', 'Promo']
# bwd: index sorted ascending, so each window covers the current row and the rows before it.
bwd = df[['Store']+event_fields].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
# fwd: index sorted descending, so each window covers the current row and the rows after it.
fwd = df[['Store']+event_fields].sort_index(ascending=False).groupby("Store").rolling(7, min_periods=1).sum()

In [29]:
for rolled in (bwd, fwd):
    # Drop the 'Store' column from the rolling result, then turn the index
    # back into ordinary columns.
    rolled.drop('Store', axis=1, inplace=True)
    rolled.reset_index(inplace=True)

In [30]:
# Turn the Date index back into a regular column before joining.
df = df.reset_index()

In [31]:
# Left-join the backward and then the forward rolling sums on (Date, Store);
# their columns overlap with df's, so they get the '_bw' / '_fw' suffixes.
for rolled, tag in ((bwd, '_bw'), (fwd, '_fw')):
    df = df.merge(
        rolled,
        how='left',
        left_on=['Date', 'Store'],
        right_on=['Date', 'Store'],
        suffixes=['', tag],
    )

In [None]:
# Bring the engineered columns of `df` back onto the train and test rows,
# matching on (Store, Date). `merge` reuses the left keys on the right side
# when `right_on` is not given.
train_df = merge(train_df, df, left_on=['Store', 'Date'])
test_df = merge(test_df, df, left_on=['Store', 'Date'])

In [33]:
# Remove the '_y' duplicate columns created by the join above.
train_df, test_df = drop_cols(train_df), drop_cols(test_df)

In [34]:
# Keep only training rows with non-zero Sales, order them by Date and
# renumber the index; the test rows are only renumbered.
train_df = (
    train_df[train_df['Sales'] != 0]
    .sort_values(by='Date', ascending=True)
    .reset_index(drop=True)
)
test_df = test_df.reset_index(drop=True)

In [35]:
# create a validation dataset of the same duration as test set
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut

42171

In [36]:
# Hold out the last `num_valid` rows (the latest dates, since train_df is
# sorted by Date) as the validation set and keep the rest for training.
num_valid = cut
valid_df, train_df = train_df[-num_valid:], train_df[:-num_valid]

In [37]:
# Create the output directory if needed. `os.makedirs` replaces the
# `!mkdir -p $OUTPUT_DATA_DIR` shell escape, which only works inside IPython on
# systems with `mkdir -p` and breaks on paths containing spaces.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

# Write the three splits as CSV files without the index column.
train_df.to_csv(os.path.join(OUTPUT_DATA_DIR, 'train.csv'), index=False)
valid_df.to_csv(os.path.join(OUTPUT_DATA_DIR, 'valid.csv'), index=False)
test_df.to_csv(os.path.join(OUTPUT_DATA_DIR, 'test.csv'), index=False)