In [None]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Preprocessing the Rossmann Store Sales Dataset
Here we implement some feature engineering outlined by FastAI in [their example solution](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson3-rossman.ipynb) to the [Kaggle Rossmann Store Sales competition](https://www.kaggle.com/c/rossmann-store-sales). We've simplified some sections and left out most of the documentation to keep things neat, so feel free to consult the original notebook for explanations of the feature engineering going on.

In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
import pandas as pd
import numpy as np
import cudf
import nvtabular as nvt
import dask_cudf

import nvtabular as nvt
from nvtabular.ops import Normalize, FillMissing, Categorify, LogOp, JoinExternal, Dropna, LambdaOp, JoinGroupby, Filter, HashBucket, FillMedian
from nvtabular.column_similarity import ColumnSimilarity

In [4]:
# from dask.distributed import Client
# from dask_cuda import LocalCUDACluster
# cluster = LocalCUDACluster(device_memory_limit="16GB")# Do this second to allow spilling from GPU memory to host memory in the workers
# client = Client(cluster)
# client

In [6]:
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', './rossmann')
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', './data')

In [7]:
# ! mkdir -p $INPUT_DATA_DIR
# ! wget -O $INPUT_DATA_DIR/rossmann.tgz http://files.fast.ai/part2/lesson14/rossmann.tgz
# ! cd $INPUT_DATA_DIR && tar -xzf rossmann.tgz && ls

In [8]:
def read_table(table_name):
    return cudf.read_csv(os.path.join(INPUT_DATA_DIR, f'{table_name}.csv'))

train = read_table('train')
store = read_table('store')
store_states = read_table('store_states')
state_names = read_table('state_names')
googletrend = read_table('googletrend')
weather = read_table('weather')
test = read_table('test')

In [9]:
train.StateHoliday = train.StateHoliday!='0'
test.StateHoliday = test.StateHoliday!='0'

In [10]:
googletrend['Date'] = googletrend.week.str.split(' - ', expand=True, n=1)[0]
googletrend['State'] = googletrend.file.str.split('_', expand=True, n=2)[2]
googletrend['State'] = googletrend.State.where(googletrend['State']!='NI', 'HB,NI')
trend_de = googletrend.loc[googletrend.file == 'Rossmann_DE'].copy()

In [11]:
for df in (weather, googletrend, train, test, trend_de):
    #df.loc[:, 'Date'] = dask_cudf.to_datetime(df.Date)
    df['Date'] = df['Date'].astype('datetime64[s]')
    df['Month'] = df.Date.dt.month
    df['Day'] = df.Date.dt.day
    df['Year'] = df.Date.dt.year.astype(str)+ '-01-01'
    df['Week']= (((df['Date'] - df['Year'].astype('datetime64[s]')).dt.days)/7).astype('int16') +1
    df['Week']=  df.Week.where(df['Week']!=53, 52)
    df['Year'] = df.Date.dt.year

In [17]:
def merge(df, right, left_on, right_on=None, suffix=None):
    df = df.merge(right, how='left', left_on=left_on, right_on=right_on or left_on, suffixes=('', suffix or '_y'))
    return df

In [18]:
weather = merge(weather, state_names, 'file', right_on='StateName')
store = merge(store, store_states, 'Store')
train_df = merge(train, store, 'Store')
test_df = merge(test, store, 'Store')

In [22]:
train_df = merge(train_df, googletrend, ['State', 'Year', 'Week'])
test_df = merge(test_df, googletrend, ['State', 'Year', 'Week'])

train = test = googletrend = None

In [25]:
train_df = merge(train_df, trend_de, ['Year', 'Week'], right_on=['Year', 'Week'], suffix='_DE')
test_df = merge(test_df, trend_de, ['Year', 'Week'], right_on=['Year', 'Week'], suffix='_DE')

In [33]:
def drop_cols(gdf):
    for c in gdf.columns:
        if c.endswith('_y'):
            if c in gdf.columns: gdf.drop(c, inplace=True, axis=1)
    return gdf

In [36]:
train_df = drop_cols(train_df)
train_df = merge(train_df, weather, ['State', 'Date'], right_on=['State', 'Date'], suffix='_y')
test_df = drop_cols(test_df)
test_df = merge(test_df, weather, ['State', 'Date'], right_on=['State', 'Date'])

In [47]:
train_df = drop_cols(train_df)
test_df = drop_cols(test_df)

In [50]:
for df in [train_df, test_df]:
    df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32)
    df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32)
    df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32)
    df['Promo2SinceWeek'] = df.Promo2SinceWeek.fillna(1).astype(np.int32)

In [51]:
train_df.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Month,...,Mean_VisibilityKm,Min_VisibilitykM,Max_Wind_SpeedKm_h,Mean_Wind_SpeedKm_h,Max_Gust_SpeedKm_h,Precipitationmm,CloudCover,Events,WindDirDegrees,StateName
0,900,1,2015-06-22,6374,619,1,0,False,0,6,...,10.0,10.0,35,16,55.0,0.0,7.0,Rain,243,Bayern
1,901,1,2015-06-22,6303,588,1,0,False,0,6,...,11.0,5.0,19,10,,0.51,6.0,Rain,244,SchleswigHolstein


In [58]:
for df in [train_df, test_df]:
    df['year']= df['CompetitionOpenSinceYear']
    df['month']= df['CompetitionOpenSinceMonth']
    df['day']= '15'
    df['CompetitionOpenSince'] = cudf.to_datetime(df[["year", "month", "day"]])
    df["CompetitionDaysOpen"] = (df['Date'] - df['CompetitionOpenSince']).dt.days

In [63]:
for df in [train_df, test_df]:
    df.loc[df.CompetitionDaysOpen<0, "CompetitionDaysOpen"] = 0
    df.loc[df.CompetitionOpenSinceYear<1990, "CompetitionDaysOpen"] = 0

In [69]:
for df in [train_df, test_df]:
    df["CompetitionMonthsOpen"] = df["CompetitionDaysOpen"]//30
    df.loc[df.CompetitionMonthsOpen>24, "CompetitionMonthsOpen"] = 24

In [83]:
for df in [train_df, test_df]:
    df['Promo2SinceYear_tmp']= df.Promo2SinceYear.astype(str)+ '-01-01'
    dt = cudf.to_datetime(df.Promo2SinceYear_tmp, format='%Y').astype(np.int64) // 10**9
    dt += 7*24*3600*df.Promo2SinceWeek
    df["Promo2Since"] = cudf.to_datetime(dt*10**9)
    df["Promo2Days"] = (df['Date']- df["Promo2Since"]).dt.days

In [97]:
for df in [train_df, test_df]:
    df.loc[df.Promo2Days<0, "Promo2Days"] = 0
    df.loc[df.Promo2SinceYear<1990, "Promo2Days"] = 0
    df["Promo2Weeks"] = df["Promo2Days"]//7
    df.loc[df.Promo2Weeks<0, "Promo2Weeks"] = 0
    df.loc[df.Promo2Weeks>25, "Promo2Weeks"] = 25
    df.Promo2Weeks.unique()

In [156]:
df = train_df.append(test_df, ignore_index=True)

In [158]:
#let's drop these dummy columns.
df.drop(['year', 'month', 'day', 'Promo2SinceYear_tmp'], inplace=True)

In [174]:
df['SchoolHoliday'] = df['SchoolHoliday'].astype('int32')
# convert stateholiday values to 0-1
df['StateHoliday'] = df['StateHoliday'] *1

This is modififed version of the original code from https://github.com/fastai/course-v3/blob/master/nbs/dl1/rossman_data_clean.ipynb
Still not working as intended with cudf. The `get_elapsed` function is defined for cumulative counting across a sorted dataframe. Given a particular field `fld` to monitor, this function will start tracking time since the last occurrence of that field. When the field is seen again, the counter is set to zero.

Upon initialization, this will result in datetime na's until the field is encountered. This is reset every time a new store is seen. We'll see how to use this shortly.

In [279]:
def get_elapsed(fld, pre):
    day1 = np.timedelta64(1, 'D')
    print('day1:', day1)
    last_date = np.datetime64()
    last_store = 0
    res = []
    for s,v,d in zip(df.Store.values,df[fld].values, df.Date.values):
        if s != last_store:
            print('s', s)
            last_date = np.datetime64()
            last_store = s
        if v: 
            print('v', v)
            last_date = d
            print((cudf.to_datetime(d)-cudf.to_datetime(last_date)).astype('timedelta64[D]'))
        res.append(((cudf.to_datetime(d)-cudf.to_datetime(last_date)).astype('timedelta64[D]') / day1))
        print(res)
    df[pre+fld] = res

In [None]:
# getting error here most likely due to `nans`
fld = 'StateHoliday'
df = df.sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')

In [239]:
fld = 'SchoolHoliday'
df = df.sort_values(['Store', 'Date'])
#df = df.iloc[:10, :].sort_values(['Store', 'Date'])
get_elapsed(fld, 'After')
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
get_elapsed(fld, 'Before')

In [109]:
# ops: masking, ffill, bfill, timedelta
df = df.sort_values(by=['Store', 'Date'])
# first build a mask indicating where stores start and end
first_indices = df.Store.diff() != 0
last_indices = df.Store.diff().iloc[1:].append(cudf.Series([1]))
last_indices.index = first_indices.index
idx_mask = ~(first_indices | last_indices)

In [None]:
# THIS CODE DOES NOT WORK BCS CUDF DOES NOT SUPPORT FFILL AND BFILL METHODS

# event_fields = ['SchoolHoliday', 'StateHoliday', 'Promo']
# for field in event_fields:
#     # use the mask from above to mask save dates from the start and end
#     # of a given store's range, as well as all dates that have an event
#     df['tmp'] = df.Date
#     #df.loc[(df[field] == 0) & idx_mask, 'tmp'] = np.nan
#     df[(df[field] == 0) & idx_mask]['tmp']= np.nan

#     # then use ffill and bbfill to give the input to the time delta
#     df['After'+field] = df.tmp.ffill()
#     df['Before'+field] = df.tmp.bfill()

#     # compute deltas between bfilled and ffilled dates and the current date
#     df['After'+field] = (df['Date'] - df['After'+field]).astype('timedelta64[D]')
#     df['Before'+field] = (df['Before'+field] - df['Date']).astype('timedelta64[D]')

# # get rid of our dummy column
# df = df.drop(columns=['tmp'])

In [23]:
df = df.set_index("Date")
bwd = df[['Store']+event_fields].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
fwd = df[['Store']+event_fields].sort_index(ascending=False).groupby("Store").rolling(7, min_periods=1).sum()

In [24]:
for d in (bwd, fwd):
    d.drop('Store', 1, inplace=True)
    d.reset_index(inplace=True)

In [25]:
df.reset_index(inplace=True)

In [26]:
for d, suffix in zip([bwd, fwd], ['_bw', '_fw']):
    df = df.left.merge(d, ['Store', 'Date'], suffix=suffix)

In [27]:
train_df = train_df.left.merge(df, ['Store', 'Date'])
test_df = test_df.left.merge(df, ['Store', 'Date'])

In [28]:
train_df.shape, test_df.shape

((1017209, 75), (41088, 75))

In [29]:
train_df = train_df[train_df.Sales != 0]

In [30]:
train_df = train_df.sort_values(by='Date', ascending=True)

In [31]:
# create a validation dataset of the same duration as test set
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut

41254

In [32]:
num_valid = cut
valid_df = train_df[-num_valid:]
train_df = train_df[:-num_valid]

In [35]:
!mkdir -p $OUTPUT_DATA_DIR

train_df.to_csv(os.path.join(OUTPUT_DATA_DIR, 'train.csv'), index=False)
valid_df.to_csv(os.path.join(OUTPUT_DATA_DIR, 'valid.csv'), index=False)
test_df.to_csv(os.path.join(OUTPUT_DATA_DIR, 'test.csv'), index=False)