## Sales forecasting for a grocery store

In [None]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
import xgboost as xgb

In [None]:
# Pin NumPy's global random state so repeated runs give the same results.
np.random.seed(seed=1234)

In [None]:
# Load the raw sales transactions. index_col=False tells pandas not to use
# the first column as the row index.
sales_csv_path = "../input/sales.csv"
train = pd.read_csv(sales_csv_path, index_col=False)

In [None]:
# Target column: quantity multiplied by price (price is cast to float64 first).
price_as_float = train['price'].astype(np.float64)
train['sales'] = train['quantity'] * price_as_float

In [None]:
# Replace every missing value in the frame (all columns) with 0.0.
train = train.fillna(value=0.)

In [None]:
# Encode each age-band letter as the truncated integer midpoint of the two
# bounds listed for it (presumably the band's age range).
#
# The built-in int() replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24. Both truncate towards zero, so the
# encoded values are unchanged (e.g. 'A' -> int(21.5) == 21).
age_band_bounds = {
    'A': (18, 25),
    'B': (25, 29),
    'C': (30, 35),
    'D': (35, 39),
    'E': (40, 44),
    'F': (45, 49),
    'G': (50, 54),
    'H': (55, 59),
    'I': (60, 64),
    'J': (65, 70),
    'K': (71, 80),
}
age_band_midpoints = {
    band: int(np.mean(bounds)) for band, bounds in age_band_bounds.items()
}

# Values that are not a known band letter pass through untouched, exactly as
# the chained mask() calls left them.
train['age_band'] = train['age_band'].map(
    lambda band: age_band_midpoints.get(band, band)
)

# Round-trip through str so the column ends up with a plain integer dtype.
train['age_band'] = train['age_band'].astype(str).astype(int)

In [None]:
# Replace each residential-area letter code with its numeric code.
# 'H' is given the out-of-range value -1000.
res_area_codes = {
    'A': 105,
    'B': 106,
    'C': 110,
    'D': 114,
    'E': 115,
    'F': 221,
    'G': 300,
    'H': -1000,
}

# Anything that is not one of the letters above is left as it is.
train['res_area'] = train['res_area'].map(
    lambda area: res_area_codes.get(area, area)
)

# Round-trip through str so the column ends up with a plain integer dtype.
train['res_area'] = train['res_area'].astype(str).astype(int)

In [None]:
# Store product ids as floats (presumably so the column is numeric for the
# model -- confirm).
train['prod_id'] = train.prod_id.astype(float)

# If the residential area (zip code) is unknown, we add an indicator feature so the model can distinguish these rows from the others, since an unknown area is treated as abnormal.

In [None]:
# Indicator feature: 1 when the residential area is unknown, 0 otherwise.
#
# By the time this cell runs, `res_area` has already been converted to
# integers and the unknown code 'H' has been replaced by the sentinel -1000.
# Comparing against the string 'H' therefore never matched and produced an
# all-zero column; the flag has to test for the sentinel instead.
train['res_area_unknown'] = (train['res_area'] == -1000).astype(int)

In [None]:
# Split trans_date on '/' into integer columns: field 0 is stored as the
# month, field 1 as the day and field 2 as the year.
for part_name, part_index in (('year', 2), ('month', 0), ('day', 1)):
    date_field = train.trans_date.apply(
        lambda stamp, i=part_index: stamp.split('/')[i]
    )
    train[part_name] = date_field.astype(int)

## Basic sales data analysis 

In [None]:
from pandas.tools import plotting
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#%matplotlib nbagg

import time
import sys
from datetime import date, datetime

In [None]:
# Bar chart of the mean sale amount for each `day` value, ignoring rows
# whose sales are zero. `sf` is an alias of `train`, not a copy.
sf = train
nonzero_sales = sf[sf['sales'] != 0]
dow = nonzero_sales.groupby(['day'])['sales'].mean()
dow.plot(kind='bar')

In [None]:
# 2x2 grid of box plots of non-zero sales grouped by `day`.
# NOTE(review): all four panels draw the same data with the same grouping --
# confirm whether different filters were intended for each panel.
_, ax = plt.subplots(nrows=2, ncols=2)
nonzero_sales = sf[sf['sales'] != 0]
nonzero_sales.boxplot(column='sales', by='day', ax=ax[0, 0])
nonzero_sales.boxplot(column='sales', by='day', ax=ax[0, 1])
nonzero_sales.boxplot(column='sales', by='day', ax=ax[1, 0])
nonzero_sales.boxplot(column='sales', by='day', ax=ax[1, 1])

In [None]:
# Distribution of the encoded residential-area values, using 100 bins.
sf.res_area.hist(bins=100)

In [None]:
# Distribution of the encoded age-band values, using 100 bins.
sf.age_band.hist(bins=100)

In [None]:
# Regression plot of sales against day for non-zero sales, one facet per
# `day` value, wrapped two facets per row. The trailing semicolon suppresses
# the notebook's text output for the returned grid object.
sns.lmplot(data=sf[sf['sales'] != 0], x='day', y='sales',
           col='day', col_wrap=2);

In [None]:
# Hold out 1% of the rows as a validation set.
#
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. `sklearn.model_selection.train_test_split` is the drop-in
# replacement; fall back to the old module only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 has no model_selection module
    from sklearn.cross_validation import train_test_split

X_train, X_test = train_test_split(train, test_size=0.01)

In [None]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
import xgboost as xgb


def ToWeight(y):
    """Return per-element RMSPE weights ``1 / y**2`` for the targets ``y``.

    Elements of ``y`` equal to zero get weight 0, so they contribute nothing
    to a weighted error instead of causing a division by zero.

    Args:
        y: Array-like of target values (NumPy array, pandas Series, list, ...).

    Returns:
        A float NumPy array with the same shape as ``y``.
    """
    # Work in float64: squaring a narrow integer array (e.g. int32) wraps
    # around silently for large values and would give wrong weights. This
    # also lets plain sequences be passed in.
    y = np.asarray(y, dtype=float)
    w = np.zeros(y.shape, dtype=float)
    nonzero = y != 0
    w[nonzero] = 1.0 / np.square(y[nonzero])
    return w


def rmspe(yhat, y):
    """Return the root mean square percentage error of ``yhat`` against ``y``.

    Each squared error is scaled by the weight ``ToWeight`` gives to the
    matching target (1 / y**2, or 0 where the target is zero) before the
    mean and square root are taken.

    Args:
        yhat: Predicted values.
        y: True values.

    Returns:
        The RMSPE as a scalar.
    """
    weights = ToWeight(y)
    squared_errors = (y - yhat) ** 2
    return np.sqrt(np.mean(weights * squared_errors))


def rmspe_xg(yhat, y):
    """Custom evaluation metric for XGBoost: RMSPE on the un-logged scale.

    Labels and predictions are both mapped back with ``exp(x) - 1`` before
    the error is computed.

    Args:
        yhat: Predictions in the transformed (log) space.
        y: Object exposing ``get_label()`` (an ``xgb.DMatrix`` when used as
            ``feval``) whose labels are in the same transformed space.

    Returns:
        A ``("rmspe", value)`` tuple.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    weights = ToWeight(actual)
    score = np.sqrt(np.mean(weights * (actual - predicted) ** 2))
    return "rmspe", score

In [None]:
# Booster settings passed to xgb.train, and the maximum number of boosting
# rounds to run.
params = dict(
    objective="reg:linear",
    eta=0.3,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    silent=1,
)
num_trees = 300

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook.
val_size = 10 ** 5

In [None]:
# Detach the target column from both splits; pop() removes it in place.
y_train, y_test = X_train.pop('sales'), X_test.pop('sales')
# The raw date string is removed from the training frame only; X_test keeps
# its trans_date column.
remove_date = X_train.pop('trans_date')

In [None]:
# Display the column names that remain in the training frame.
X_train.columns.values

In [None]:
# Columns fed to the model, in the order they are passed to XGBoost.
features = [
    # customer attributes
    'cust_id',
    'age_band',
    'res_area',
    # product attributes
    'prod_cat',
    'prod_id',
    'asset',
    # engineered indicator
    'res_area_unknown',
    # transaction date parts
    'year',
    'month',
    'day',
]

In [None]:
# Wrap both splits as DMatrix objects. The model is trained on
# log(sales + 1) rather than on raw sales.
train_labels = np.log(y_train + 1)
valid_labels = np.log(y_test + 1)
dtrain = xgb.DMatrix(X_train[features], label=train_labels)
dvalid = xgb.DMatrix(X_test[features], label=valid_labels)

In [None]:
# Train the booster, reporting the custom RMSPE metric on both sets.
#
# When `evals` has several entries, XGBoost uses the LAST one for early
# stopping. The held-out set must therefore come last; with the training set
# last, stopping was driven by the training error, which keeps improving as
# the model overfits.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)

In [None]:
%matplotlib inline
import seaborn as sns
# Enlarge seaborn's fonts to 1.5x for the plots that follow.
sns.set(font_scale=1.5)

In [None]:
# Plot the importance of each feature in the trained booster.
xgb.plot_importance(gbm)

In [None]:
# Score the trained model on the held-out rows.
print("Validating")
validation_matrix = xgb.DMatrix(X_test[features])
train_probs = gbm.predict(validation_matrix)
# Predictions are in log(sales + 1) space; a negative value would map back
# to negative sales, so clamp those to 0 (i.e. zero sales).
train_probs[train_probs < 0] = 0
predicted_sales = np.exp(train_probs) - 1
error = rmspe(predicted_sales, y_test)
print('error', error)

In [None]:
# Write the held-out customers' forecasts, mapped back from log space to the
# sales scale, to a CSV file without the index column.
print("Saving forecast predictions")
forecast = np.exp(train_probs) - 1
submission = pd.DataFrame({"cust_id": X_test["cust_id"], "Sales": forecast})
submission.to_csv("../results/submission.csv", index=False)

# Predictions are produced for each cust_id in the held-out set.