In [15]:
## For a description of the data, see the Data tab of the Kaggle competition.
## Predictions are required for rows of the following form:
#id,date,store_nbr,item_nbr,onpromotion
#125497040,2017-08-16,1,96995,False
#...
#128867503,2017-08-31,54,2134244,False


# Inspired by this Kaggle kernel:
#(https://www.kaggle.com/tarobxl/how-the-test-set-is-split-lb-0-532)

#import a dataset
from datetime import timedelta

import numpy as np
import pandas as pd

# Explicit dtypes for the key columns; also reused when reading test.csv.
dtypes = dict(id='int64', item_nbr='int32', store_nbr='int8')

# Read only columns 1-4 of train.csv. Data rows 1..101688778 are skipped
# (the header row 0 is kept) so that loading starts at 2017-01-01.
train = pd.read_csv(
    '/Users/Koos/Downloads/DataFav/train.csv',
    usecols=[1, 2, 3, 4],
    dtype=dtypes,
    parse_dates=['date'],
    skiprows=range(1, 101688779),
)

In [16]:
# Negative unit_sales are set to zero before the log transform.
train.loc[train['unit_sales'] < 0, 'unit_sales'] = 0
# Work on the log1p scale; the forecast is converted back with expm1 later.
# `pd.np` was removed from pandas, so NumPy is called directly.
train['unit_sales'] = np.log1p(train['unit_sales'])
# dow = day of week as returned by `.dt.dayofweek`.
train['dow'] = train['date'].dt.dayofweek


In [17]:
# Create a row for every (date, store, item) combination seen in the data,
# so that averages are taken over all days and not only over days with a
# recorded sale. Combinations without a record get NaN in every column.
u_dates = train['date'].unique()
u_stores = train['store_nbr'].unique()
u_items = train['item_nbr'].unique()

train = train.set_index(['date', 'store_nbr', 'item_nbr']).reindex(
    pd.MultiIndex.from_product(
        [u_dates, u_stores, u_items],
        names=['date', 'store_nbr', 'item_nbr'],
    )
)

In [18]:
# Clean up: the arrays of unique keys are no longer needed.
del u_dates
del u_stores
del u_items

In [19]:
# Fill NaNs: rows created by the reindex have no sales record, so count them
# as zero sales. The result is assigned back instead of calling
# fillna(inplace=True) on a `.loc` selection: that is a chained in-place
# update, which does not reach `train` when pandas Copy-on-Write is active.
train['unit_sales'] = train['unit_sales'].fillna(0)

# Turn the (date, store_nbr, item_nbr) index levels back into columns.
train = train.reset_index()

# Most recent date in the training data. Taking the maximum (rather than the
# date of the final row) keeps this correct regardless of row order.
lastdate = train['date'].max()

In [20]:
# Load the rows to predict (columns 0-4) and add the same day-of-week
# feature as on the training data.
test = pd.read_csv(
    '/Users/Koos/Downloads/DataFav/test.csv',
    usecols=[0, 1, 2, 3, 4],
    dtype=dtypes,
    parse_dates=['date'],
)
test['dow'] = test['date'].dt.dayofweek
# Non-null count per column, shown as the cell output.
test.count()

id             3370464
date           3370464
store_nbr      3370464
item_nbr       3370464
onpromotion    3370464
dow            3370464
dtype: int64

In [21]:
# madw: mean log-sales per (item, store, day of week).
# NOTE(review): `dow` is assigned before the reindex, so rows created by the
# reindex have NaN `dow` and are left out by groupby's default NaN-key
# handling - confirm this is intended.
ma_dw = (
    train[['item_nbr', 'store_nbr', 'dow', 'unit_sales']]
    .groupby(['item_nbr', 'store_nbr', 'dow'])['unit_sales']
    .mean()
    .to_frame('madw')
    .reset_index()
)

# mawk: mean of the day-of-week means per (store, item).
ma_wk = (
    ma_dw[['item_nbr', 'store_nbr', 'madw']]
    .groupby(['store_nbr', 'item_nbr'])['madw']
    .mean()
    .to_frame('mawk')
    .reset_index()
)

In [22]:
# Mean log-sales per (item, store): first over all loaded rows ('mais226'),
# then over trailing windows of 112, 56, ..., 1 days ending at `lastdate`.
# Series.to_frame(name) turns the grouped Series into a one-column DataFrame
# whose column is called `name`.
pair_keys = ['item_nbr', 'store_nbr']

ma_is = (
    train[pair_keys + ['unit_sales']]
    .groupby(pair_keys)['unit_sales']
    .mean()
    .to_frame('mais226')
)
for window in (112, 56, 28, 14, 7, 3, 1):
    # Keep only rows strictly after (lastdate - window days).
    recent = train[train['date'] > lastdate - timedelta(window)]
    window_mean = (
        recent.groupby(pair_keys)['unit_sales']
        .mean()
        .to_frame('mais' + str(window))
    )
    ma_is = ma_is.join(window_mean, how='left')


In [23]:
# Row-wise median across all columns currently in ma_is (mais226 and the
# windowed means), i.e. one value per (item, store) pair. pandas skips NaN
# values by default when computing it.
ma_is['mais'] = ma_is.median(axis='columns')

In [24]:
# Move item_nbr / store_nbr from the MultiIndex back into ordinary columns
# so they can serve as merge keys.
ma_is = ma_is.reset_index()

In [25]:
# Build the test set: attach the averages to every test row. Left joins keep
# all test rows, so rows without a matching key in the averages get NaN.
test = (
    test
    .merge(ma_is, how='left', on=['item_nbr', 'store_nbr'])
    .merge(ma_wk, how='left', on=['item_nbr', 'store_nbr'])
    .merge(ma_dw, how='left', on=['item_nbr', 'store_nbr', 'dow'])
)

In [26]:
# Clean up: the averages now live in `test`, so drop the separate frames.
del ma_is
del ma_wk
del ma_dw

In [27]:
# Forecasting test.
# Baseline (log scale): the median of the per-(item, store) means.
test['unit_sales'] = test['mais']

# Where the weekly mean is positive, scale the baseline by the ratio of the
# day-of-week mean to the weekly mean.
pos_idx = test['mawk'] > 0
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'unit_sales'] = test_pos['mais'] * test_pos['madw'] / test_pos['mawk']

# Rows still NaN after the merges are forecast as zero. Assign the result
# back instead of calling fillna(inplace=True) on a `.loc` selection, which
# does not reach `test` when pandas Copy-on-Write is active.
test['unit_sales'] = test['unit_sales'].fillna(0)

# Undo the log1p transform applied to the training target.
# `pd.np` was removed from pandas, so NumPy is called directly.
test['unit_sales'] = np.expm1(test['unit_sales'])

In [28]:
# Multiply the forecast of promoted rows by a fixed factor of 1.14.
promo_mask = test['onpromotion'] == True
test.loc[promo_mask, 'unit_sales'] *= 1.14

In [29]:
# Write the submission (id, unit_sales) as a gzip-compressed CSV with three
# decimals per value.
submission = test[['id', 'unit_sales']]
submission.to_csv(
    'ma8dwof.csv.gz',
    index=False,
    float_format='%.3f',
    compression='gzip',
)