In [1]:
import os

In [2]:
# Move the working directory one level up; relative paths used later in the
# notebook are resolved from there.
# NOTE: not idempotent - re-running this cell moves up one more level each time.
os.chdir('..')

In [4]:
pwd

'/Users/A.IVA/Documents/jupyter_notebooks/coursera_and_blogs/rossmann_competition'

### 1 Idea f01 `https://www.kaggle.com/xwxw2929/rossmann-sales-top1`

Based on https://www.kaggle.com/justdoit/rossmann-store-sales/xgboost-in-python-with-rmspe/code    
Public Score :  0.11389    
Private Validation Score :  0.096959     

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt


In [8]:
def create_feature_map(features, fmap_path='xgb.fmap'):
    """Write a feature-map file with one line per feature.

    Each line has the form ``<index>\\t<feature name>\\tq``, where the index is
    the feature's position in ``features``.

    Args:
        features: iterable of feature names, in model column order.
        fmap_path: destination file; defaults to ``xgb.fmap`` in the current
            working directory. An existing file is overwritten.
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def rmspe(y, yhat):
    """Root mean square percentage error of ``yhat`` relative to ``y``.

    Args:
        y: actual values; used as the divisor, so they must be non-zero.
        yhat: predicted values, same shape as ``y``.

    Returns:
        ``sqrt(mean((yhat / y - 1) ** 2))``.
    """
    relative_error = yhat / y - 1
    mean_squared = np.mean(relative_error ** 2)
    return np.sqrt(mean_squared)

def rmspe_xg(yhat, y):
    """Custom evaluation metric for ``xgb.train``: RMSPE on the original scale.

    Both the labels and the predictions are mapped back with ``expm1`` before
    the error is computed (the training cell fits on ``log1p(Sales)``).

    Args:
        yhat: predictions on the transformed scale.
        y: object exposing ``get_label()`` that returns the transformed labels.

    Returns:
        Tuple ``("rmspe", value)``.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

# Gather some features
def build_features(features, data):
    """Add the engineered model columns to ``data`` in place and record their names.

    Args:
        features: list that the model input column names are appended to
            (mutated in place; pass a throw-away list to only transform ``data``).
        data: merged sales/store DataFrame. It needs a datetime ``Date`` column
            plus the raw columns referenced below, and is modified in place.

    Returns:
        The same ``data`` object; callers that rely on the in-place mutation
        may ignore it.
    """
    # A missing 'Open' flag means "assume the store was open". This must run
    # before the blanket zero fill, otherwise no NaN is left to detect and the
    # flag would silently become 0.
    if 'Open' in data.columns:
        data['Open'] = data['Open'].fillna(1)
    # Every other missing value becomes 0.
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        # Assign the column back explicitly: an in-place replace on an
        # attribute-accessed column is not guaranteed to reach the frame under
        # copy-on-write. Values without a mapping pass through unchanged.
        data[column] = data[column].map(lambda value: mappings.get(value, value))

    # Calendar features derived from Date. DayOfWeek is overwritten with the
    # pandas convention (Monday == 0).
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
    # replacement. Cast to a plain integer dtype.
    data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype(int)

    # CompetitionOpen and PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Months the competitor has been open. Rows whose CompetitionOpenSince*
    # values were missing hold 0 after the fill above, so they produce very
    # large values here.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data['Year'] - data['CompetitionOpenSinceYear']) + \
        (data['Month'] - data['CompetitionOpenSinceMonth'])
    # Promo open time in months (a week counts as a quarter of a month)
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data['Year'] - data['Promo2SinceYear']) + \
        (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0
    # Keep positive durations only; everything else becomes 0.
    data['PromoOpen'] = data['PromoOpen'].where(data['PromoOpen'] > 0, 0)
    # Promo2SinceYear == 0 is the zero-filled "no Promo2" case.
    data.loc[data['Promo2SinceYear'] == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    # 'Sept' (not 'Sep') is the spelling matched against PromoInterval.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data['Month'].map(month2str)
    # Zero-filled (originally missing) intervals become the empty string.
    data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    # Iterate over the distinct interval strings rather than over rows.
    for interval in data['PromoInterval'].unique():
        if interval != '':
            for month in interval.split(','):
                in_promo = (data['monthStr'] == month) & (data['PromoInterval'] == interval)
                data.loc[in_promo, 'IsPromoMonth'] = 1

    return data

In [9]:
# Directory holding the raw CSVs read below (train.csv, test.csv, store.csv).
# The path is relative, so it is resolved against the current working directory.
data_dir = os.path.join('..', 'data', 'rossmann-store-sales', 'source')

In [10]:
print("Load the training, test and store data using pandas")
# Explicit dtypes handed to read_csv for train.csv and test.csv.
# NOTE(review): several of these columns (CompetitionOpenSince*, Promo2SinceWeek,
# PromoInterval) look like store.csv columns, which is read without `dtype` -
# confirm whether they are needed here.
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
# parse_dates is given by column position (presumably the Date column in each file).
train = pd.read_csv(os.path.join(data_dir, "train.csv"), parse_dates=[2], dtype=types)
test = pd.read_csv(os.path.join(data_dir, "test.csv"), parse_dates=[3], dtype=types)
store = pd.read_csv(os.path.join(data_dir, "store.csv"))

print("Assume store open, if not provided")
# Default only the 'Open' flag. A blanket fillna(1) would also turn any other
# missing value into 1, which is not what the message above states.
train.fillna({'Open': 1}, inplace=True)
test.fillna({'Open': 1}, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
# rmspe divides by the actual sales, so zero-sales rows must be excluded.
train = train[train["Sales"] > 0]

print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

print("augment features")
# build_features mutates the frame in place; its return value is not needed.
# The feature-name list is collected once (from the train call) and reused for test.
build_features(features, train)
build_features([], test)
print(features)

print('training data processed')

params = {"objective": "reg:squarederror",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          # `silent` is deprecated in xgboost; `verbosity: 0` is the silent setting.
          "verbosity": 0,
          "seed": 1301
          }
num_boost_round = 300


Load the training, test and store data using pandas
Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store
augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
training data processed


In [12]:
# Sanity check: first two rows of the merged training frame with the engineered columns.
train.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,4,2015-07-31,5263,555,1,1,0,1.0,3,...,0.0,,2015,7,31,31,82.0,0.0,Jul,0
1,1,3,2015-07-30,5020,546,1,1,0,1.0,3,...,0.0,,2015,7,30,31,82.0,0.0,Jul,0


In [17]:
# Force the label-encoded columns to an integer dtype before they are handed to xgboost.
for encoded_col in ('StateHoliday', 'Assortment'):
    train[encoded_col] = train[encoded_col].astype(int)

In [20]:
print("Train a XGBoost model")
# Hold out 1.2% of the rows for validation. X_train / X_valid are full frames
# (they still contain Sales); only the `features` columns go to the model.
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)

# The model is fitted on log1p(Sales); rmspe_xg maps back with expm1.
y_train, y_valid = np.log1p(X_train.Sales), np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], label=y_train)
dvalid = xgb.DMatrix(X_valid[features], label=y_valid)

# Both sets are reported every round; the log below shows 'eval-rmspe' being
# used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=watchlist,
    early_stopping_rounds=100,
    feval=rmspe_xg,
    verbose_eval=True,
)

Train a XGBoost model
[0]	train-rmse:5.79377	eval-rmse:5.79408	train-rmspe:0.99684	eval-rmspe:0.996841
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.06353	eval-rmse:4.06502	train-rmspe:0.981462	eval-rmspe:0.981484
[2]	train-rmse:2.85371	eval-rmse:2.85572	train-rmspe:0.938004	eval-rmspe:0.938079
[3]	train-rmse:2.00973	eval-rmse:2.0122	train-rmspe:0.856688	eval-rmspe:0.856823
[4]	train-rmse:1.42415	eval-rmse:1.42679	train-rmspe:0.743816	eval-rmspe:0.743606
[5]	train-rmse:1.01979	eval-rmse:1.02292	train-rmspe:0.619458	eval-rmspe:0.618166
[6]	train-rmse:0.743183	eval-rmse:0.746465	train-rmspe:0.504781	eval-rmspe:0.50062
[7]	train-rmse:0.555348	eval-rmse:0.558586	train-rmspe:0.412774	eval-rmspe:0.404574
[8]	train-rmse:0.436654	eval-rmse:0.439691	train-rmspe:0.352529	eval-rmspe:0.33875
[9]	train-rmse:0.364029	eval-rmse:0.366774	train-rmspe:0.320635	eval-rmspe:0.300824
[10]	tra

[96]	train-rmse:0.100283	eval-rmse:0.106113	train-rmspe:0.121369	eval-rmspe:0.11208
[97]	train-rmse:0.10008	eval-rmse:0.105962	train-rmspe:0.12118	eval-rmspe:0.111944
[98]	train-rmse:0.099647	eval-rmse:0.105611	train-rmspe:0.12079	eval-rmspe:0.111545
[99]	train-rmse:0.09952	eval-rmse:0.105499	train-rmspe:0.120676	eval-rmspe:0.111438
[100]	train-rmse:0.099342	eval-rmse:0.105325	train-rmspe:0.120509	eval-rmspe:0.111111
[101]	train-rmse:0.099118	eval-rmse:0.105145	train-rmspe:0.118589	eval-rmspe:0.110912
[102]	train-rmse:0.098825	eval-rmse:0.104926	train-rmspe:0.118301	eval-rmspe:0.110709
[103]	train-rmse:0.098477	eval-rmse:0.104685	train-rmspe:0.117965	eval-rmspe:0.110434
[104]	train-rmse:0.097956	eval-rmse:0.104264	train-rmspe:0.117506	eval-rmspe:0.109986
[105]	train-rmse:0.097587	eval-rmse:0.103986	train-rmspe:0.117133	eval-rmspe:0.109681
[106]	train-rmse:0.097222	eval-rmse:0.10365	train-rmspe:0.116642	eval-rmspe:0.109346
[107]	train-rmse:0.096864	eval-rmse:0.103382	train-rmspe:0.11628

[192]	train-rmse:0.083095	eval-rmse:0.094592	train-rmspe:0.091397	eval-rmspe:0.09889
[193]	train-rmse:0.082988	eval-rmse:0.094531	train-rmspe:0.091251	eval-rmspe:0.098829
[194]	train-rmse:0.082916	eval-rmse:0.09446	train-rmspe:0.091166	eval-rmspe:0.098754
[195]	train-rmse:0.082802	eval-rmse:0.094443	train-rmspe:0.091049	eval-rmspe:0.098732
[196]	train-rmse:0.08268	eval-rmse:0.094401	train-rmspe:0.090821	eval-rmspe:0.098704
[197]	train-rmse:0.082589	eval-rmse:0.094326	train-rmspe:0.090736	eval-rmspe:0.098604
[198]	train-rmse:0.082479	eval-rmse:0.094316	train-rmspe:0.090623	eval-rmspe:0.098588
[199]	train-rmse:0.082382	eval-rmse:0.094267	train-rmspe:0.090531	eval-rmspe:0.098552
[200]	train-rmse:0.082305	eval-rmse:0.094238	train-rmspe:0.090448	eval-rmspe:0.098546
[201]	train-rmse:0.08216	eval-rmse:0.094119	train-rmspe:0.090315	eval-rmspe:0.098425
[202]	train-rmse:0.082066	eval-rmse:0.094059	train-rmspe:0.090219	eval-rmspe:0.098363
[203]	train-rmse:0.081991	eval-rmse:0.094009	train-rmspe:0

[288]	train-rmse:0.074729	eval-rmse:0.091121	train-rmspe:0.079944	eval-rmspe:0.095291
[289]	train-rmse:0.07467	eval-rmse:0.091115	train-rmspe:0.079873	eval-rmspe:0.095278
[290]	train-rmse:0.074626	eval-rmse:0.091135	train-rmspe:0.07982	eval-rmspe:0.095293
[291]	train-rmse:0.074564	eval-rmse:0.091109	train-rmspe:0.079755	eval-rmspe:0.095263
[292]	train-rmse:0.07449	eval-rmse:0.091059	train-rmspe:0.07968	eval-rmspe:0.095199
[293]	train-rmse:0.074415	eval-rmse:0.091041	train-rmspe:0.079558	eval-rmspe:0.095179
[294]	train-rmse:0.074333	eval-rmse:0.091043	train-rmspe:0.079458	eval-rmspe:0.095185
[295]	train-rmse:0.074259	eval-rmse:0.091031	train-rmspe:0.079384	eval-rmspe:0.095174
[296]	train-rmse:0.074147	eval-rmse:0.090993	train-rmspe:0.079229	eval-rmspe:0.095134
[297]	train-rmse:0.074107	eval-rmse:0.090985	train-rmspe:0.079183	eval-rmspe:0.095121
[298]	train-rmse:0.074024	eval-rmse:0.090959	train-rmspe:0.07907	eval-rmspe:0.095094
[299]	train-rmse:0.073964	eval-rmse:0.090937	train-rmspe:0.

In [21]:
print("Validating")
# Predictions come back on the log1p scale used for training, so they are
# mapped back with expm1 before being compared with the raw Sales values.
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
actual_sales = X_valid.Sales.values
predicted_sales = np.expm1(yhat)
error = rmspe(actual_sales, predicted_sales)
print('RMSPE: {:.6f}'.format(error))

Validating
RMSPE: 0.095087


In [23]:
# Same integer cast as applied to the training frame, so both frames feed
# xgboost identical dtypes.
for encoded_col in ('StateHoliday', 'Assortment'):
    test[encoded_col] = test[encoded_col].astype(int)

In [24]:
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
# Despite the name these are regression outputs on the log1p scale, not probabilities.
test_probs = gbm.predict(dtest)

# Make Submission: one row per test Id, with Sales mapped back via expm1.
result = test[["Id"]].assign(Sales=np.expm1(test_probs))
result.to_csv("xgboost_10_submission.csv", index=False)


Make predictions on the test set


In [25]:
# XGB feature importances
# Based on https://www.kaggle.com/mmueller/liberty-mutual-group-property-inspection-prediction/xgb-feature-importance-python/code

# Write the feature map first so get_fscore can report feature names.
create_feature_map(features)
importance = gbm.get_fscore(fmap='xgb.fmap')
# List of (feature, fscore) pairs, lowest score first.
importance = sorted(importance.items(), key=lambda pair: pair[1])

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalise so the scores sum to 1.
fscore_total = df['fscore'].sum()
df['fscore'] = df['fscore'] / fscore_total

featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
featp.set_title('XGBoost Feature Importance')
featp.set_xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)

#!/usr/bin/python

In [26]:
# Un-normalised (feature, fscore) pairs, sorted ascending by fscore.
importance

[('StateHoliday', 497),
 ('Promo2', 2257),
 ('IsPromoMonth', 2610),
 ('Assortment', 3221),
 ('SchoolHoliday', 5213),
 ('StoreType', 5427),
 ('Year', 6569),
 ('Promo', 7504),
 ('Month', 15153),
 ('PromoOpen', 20662),
 ('CompetitionDistance', 22161),
 ('DayOfWeek', 23650),
 ('WeekOfYear', 23755),
 ('Store', 29970),
 ('CompetitionOpen', 30087),
 ('Day', 35520)]

### 2 Idea f02