In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import math
from collections import Counter
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

In [2]:
store = pd.read_csv("input/store.csv", encoding='utf8')

store.insert(10, 'promo2_date_y', \
        store['Promo2SinceYear'].map(lambda x : datetime(2050,1,1) if pd.isnull(x) else datetime(int(x), 1, 1)))

store['promo2_date_y'] = store['promo2_date_y'] + \
            store['Promo2SinceWeek'].map(lambda x : timedelta(days=0) if pd.isnull(x) else timedelta(days=x*7))

# if CompetitionOpenSinceYear and Month is Nan, then competition_open_date set 1900-01-01
# merge CompetitionOpenSinceYear , Month and 01,  then Year-month-01
store['CompetitionOpenSinceMonth'] = \
    store['CompetitionOpenSinceMonth'].map(lambda x : '1' if pd.isnull(x) else str(int(x)))
store['CompetitionOpenSinceYear'] =  \
    store['CompetitionOpenSinceYear'].map(lambda x : '1900' if pd.isnull(x) else str(int(x)))

store.insert(4, 'competition_open_date', \
                store['CompetitionOpenSinceYear'] + '-' + store['CompetitionOpenSinceMonth'] + '-1')

store['competition_open_date'] = store['competition_open_date'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))

store.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear'], \
              axis=1, inplace=True)

# store  competitiondistance is null when store = 291 or 622, 879
# max(competitiondistance) = 75,860; if null set as 200,000
store['CompetitionDistance'] = store['CompetitionDistance'].map(lambda x : 200000 if pd.isnull(x) else x)

store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,competition_open_date,Promo2,PromoInterval,promo2_date_y
0,1,c,a,1270.0,2008-09-01,0,,2050-01-01
1,2,a,a,570.0,2007-11-01,1,"Jan,Apr,Jul,Oct",2010-04-02
2,3,a,a,14130.0,2006-12-01,1,"Jan,Apr,Jul,Oct",2011-04-09
3,4,c,c,620.0,2009-09-01,0,,2050-01-01
4,5,a,a,29910.0,2015-04-01,0,,2050-01-01


In [8]:
train = pd.read_csv("input/train.csv", encoding='utf8')

train['Date'] = train['Date'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))
train['Open'] = train['Open'].astype(int)
train['StateHoliday'] = train['StateHoliday'].astype(str)

train_data = pd.merge(train, store, on='Store')
train_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,competition_open_date,Promo2,PromoInterval,promo2_date_y
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,2008-09-01,0,,2050-01-01
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,2008-09-01,0,,2050-01-01
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,2008-09-01,0,,2050-01-01
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,2008-09-01,0,,2050-01-01
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,2008-09-01,0,,2050-01-01


In [9]:
# consider only data when the store is not open
print('before removing closing days', train_data.shape)
train_data = train_data[train_data['Open'] != 0]
print('after removing closing days', train_data.shape)

before removing closing days (1017209, 16)
after removing closing days (844392, 16)


In [10]:
# add id
train_data.insert(0, 'idx', range(len(train_data)))

train_data.columns = train_data.columns.str.lower()

# judge whether the store is in Promo2 in the Date
# train_promo2 = train_data.loc[:, ['idx', 'date', 'open', 'promo2', 'promoInterval', 'promo2_date_y']] # future warning
train_promo2 = train_data.reindex(['idx', 'date', 'open', 'promo2', 'promointerval', 'promo2_date_y'], axis=1)
train_promo2.head()

Unnamed: 0,idx,date,open,promo2,promointerval,promo2_date_y
0,0,2015-07-31,1,0,,2050-01-01
1,1,2015-07-30,1,0,,2050-01-01
2,2,2015-07-29,1,0,,2050-01-01
3,3,2015-07-28,1,0,,2050-01-01
4,4,2015-07-27,1,0,,2050-01-01


In [11]:
result = {}
month_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
              'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
              'Sept': 5, 'Oct': 10, 'Nov': 11, 'Dec': 12}
# for idx, date, is_open, p2, m, p2_date in test_promo2.itertuples():
for row in train_promo2.itertuples():
    # idx-1, date-2, is_open-3, promo2-4, promoInterval(months)-5, promo2_date_y-6
    if row[3]: # Open == 1
        if row[4]: # Promo2 == 1
            if row[2] > row[6]: # Date > Promo2Since_date
                if pd.isnull(row[5]): # PromoInterval is null
                    result[row[1]] = 0
                else:
                    months = row[5].split(',')
                    if row[2].month in [month_dict[x] for x in months]: # Date.month is in PromoInterval
                        result[row[1]] = 1
                    else:
                        result[row[1]] = 0
            else:
                result[row[1]] = 0
        else:
            result[row[1]] = 0
    else:
        result[row[1]] = 0
print(len(result))

844392


In [12]:
d1 = pd.DataFrame.from_dict(result, orient='index')
d1.reset_index(inplace=True)
d1.columns = ['idx', 'new_promo2']
d1.head()

Unnamed: 0,idx,new_promo2
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [13]:
train_data = pd.merge(train_data, d1, how='left', on='idx')
train_data.drop(['promo2', 'promointerval', 'promo2_date_y'], axis=1, inplace=True)
train_data.head()

Unnamed: 0,idx,store,dayofweek,date,sales,customers,open,promo,stateholiday,schoolholiday,storetype,assortment,competitiondistance,competition_open_date,new_promo2
0,0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,2008-09-01,0
1,1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,2008-09-01,0
2,2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,2008-09-01,0
3,3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,2008-09-01,0
4,4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,2008-09-01,0


In [14]:
# judge whether the competition store is opened before the store
train_data.insert(13, 'is_competition', train_data['competition_open_date'] < train_data['date'])

Counter(train_data['is_competition'])

Counter({True: 774160, False: 70232})

In [15]:
# trans competition factor into competition distances
## if is_competition is false then competition distances = 100000
## competition distances between 20 and 75860
## competition distances += (1 - is_competition) * 100000
train_data['competitiondistance'] += (1 - train_data['is_competition']) * 100000

max(train_data['competitiondistance'])

200000.0

In [16]:
# trans date to year, month
# time_year - 2012 : flag of years
train_data.insert(4, 'time_year', [time.year for time in train_data['date']])
train_data.insert(5, 'time_month', [time.month for time in train_data['date']])
train_data.insert(6, 'week_of_year', [((x - datetime(x.year, 1, 1)).days + 1)//7 + 1 for x in train_data['date']])

In [18]:
stateholiday_dict = {'0': 0, 'a': 1, 'b': 1, 'c': 1}
train_data['stateholiday'] = train_data['stateholiday'].map(stateholiday_dict)

storetype_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
train_data['storetype'] = train_data['storetype'].map(storetype_dict).astype(float)

assortment_dict = {'a': 1, 'b': 2, 'c': 3}
train_data['assortment'] = train_data['assortment'].map(assortment_dict).astype(float)

train_data.drop(['idx', 'customers', 'competition_open_date'], axis=1, inplace=True)

train_data.sort_values(by='date', inplace=True)

In [28]:
# ['Store',
#  'CompetitionDistance',
#  'CompetitionOpenSinceMonth',
#  'CompetitionOpenSinceYear',
#  'Promo',
#  'Promo2',
#  'Promo2SinceWeek',
#  'Promo2SinceYear',
#  'SchoolHoliday',
#  'DayOfWeek',
#  'month',
#  'day',
#  'year',
#  'StoreType',
#  'Assortment']

features = ['store', 'dayofweek', 'time_year', 'time_month', 'week_of_year',
            'promo', 'new_promo2', 'stateholiday', 'schoolholiday', 'storetype',
            'assortment', 'competitiondistance', 'is_competition']

In [33]:
test = pd.read_csv("input/test.csv", encoding='utf8')

test['Date'] = test['Date'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))
test['Open'] = test['Open'].astype(float)
test['StateHoliday'] = test['StateHoliday'].astype(str)

test_data = pd.merge(test, store, on='Store')

# add id
test_data.insert(0, 'idx', range(len(test_data)))

test_data.columns = test_data.columns.str.lower()

# judge whether the store is in Promo2 in the Date
# train_promo2 = train_data.loc[:, ['idx', 'date', 'open', 'promo2', 'promoInterval', 'promo2_date_y']] # future warning
test_promo2 = test_data.reindex(['idx', 'date', 'open', 'promo2', 'promointerval', 'promo2_date_y'], axis=1)

result = {}
month_dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
              'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
              'Sept': 5, 'Oct': 10, 'Nov': 11, 'Dec': 12}
# for idx, date, is_open, p2, m, p2_date in test_promo2.itertuples():
for row in test_promo2.itertuples():
    # idx-1, date-2, is_open-3, promo2-4, promoInterval(months)-5, promo2_date_y-6
    if row[3]: # Open == 1
        if row[4]: # Promo2 == 1
            if row[2] > row[6]: # Date > Promo2Since_date
                if pd.isnull(row[5]): # PromoInterval is null
                    result[row[1]] = 0
                else:
                    months = row[5].split(',')
                    if row[2].month in [month_dict[x] for x in months]: # Date.month is in PromoInterval
                        result[row[1]] = 1
                    else:
                        result[row[1]] = 0
            else:
                result[row[1]] = 0
        else:
            result[row[1]] = 0
    else:
        result[row[1]] = 0
print(len(result))

d1t = pd.DataFrame.from_dict(result, orient='index')
d1t.reset_index(inplace=True)
d1t.columns = ['idx', 'new_promo2']
# d1t.head()


test_data = pd.merge(test_data, d1t, how='left', on='idx')
test_data.drop(['promo2', 'promointerval', 'promo2_date_y'], axis=1, inplace=True)
# test_data.head()

# judge whether the competition store is opened before the store
test_data.insert(13, 'is_competition', test_data['competition_open_date'] < test_data['date'])

test_data['competitiondistance'] += (1 - test_data['is_competition']) * 100000

test_data.insert(4, 'time_year', [time.year for time in test_data['date']])
test_data.insert(5, 'time_month', [time.month for time in test_data['date']])
test_data.insert(6, 'week_of_year', [((x - datetime(x.year, 1, 1)).days + 1)//7 + 1 for x in test_data['date']])

stateholiday_dict = {'0': 0, 'a': 1, 'b': 1, 'c': 1}
test_data['stateholiday'] = test_data['stateholiday'].map(stateholiday_dict)

storetype_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
test_data['storetype'] = test_data['storetype'].map(storetype_dict).astype(float)

assortment_dict = {'a': 1, 'b': 2, 'c': 3}
test_data['assortment'] = test_data['assortment'].map(assortment_dict).astype(float)

test_data.head()

41088


Unnamed: 0,idx,id,store,dayofweek,time_year,time_month,week_of_year,date,open,promo,stateholiday,schoolholiday,storetype,assortment,competitiondistance,competition_open_date,is_competition,new_promo2
0,0,1,1,4,2015,9,38,2015-09-17,1.0,1,0,0,3.0,1.0,1270.0,2008-09-01,True,0
1,1,857,1,3,2015,9,38,2015-09-16,1.0,1,0,0,3.0,1.0,1270.0,2008-09-01,True,0
2,2,1713,1,2,2015,9,37,2015-09-15,1.0,1,0,0,3.0,1.0,1270.0,2008-09-01,True,0
3,3,2569,1,1,2015,9,37,2015-09-14,1.0,1,0,0,3.0,1.0,1270.0,2008-09-01,True,0
4,4,3425,1,7,2015,9,37,2015-09-13,0.0,0,0,0,3.0,1.0,1270.0,2008-09-01,True,0


In [34]:
test_data[features].head()

Unnamed: 0,store,dayofweek,time_year,time_month,week_of_year,promo,new_promo2,stateholiday,schoolholiday,storetype,assortment,competitiondistance,is_competition
0,1,4,2015,9,38,1,0,0,0,3.0,1.0,1270.0,True
1,1,3,2015,9,38,1,0,0,0,3.0,1.0,1270.0,True
2,1,2,2015,9,37,1,0,0,0,3.0,1.0,1270.0,True
3,1,1,2015,9,37,1,0,0,0,3.0,1.0,1270.0,True
4,1,7,2015,9,37,0,0,0,0,3.0,1.0,1270.0,True


In [35]:
from sklearn.model_selection import train_test_split

x_train, x_val = train_test_split(train_data, test_size= 0.01, shuffle=False, random_state=0)
print(x_train.shape, x_val.shape)

(835948, 16) (8444, 16)


In [47]:
x_val.head()

Unnamed: 0,store,dayofweek,date,time_year,time_month,week_of_year,sales,open,promo,stateholiday,schoolholiday,storetype,assortment,competitiondistance,is_competition,new_promo2
40697,55,4,2015-07-23,2015,7,30,4406,1,0,0,1,1.0,1.0,720.0,True,0
145840,196,4,2015-07-23,2015,7,30,4406,1,0,0,1,3.0,1.0,3850.0,True,1
270006,358,4,2015-07-23,2015,7,30,7221,1,0,0,1,1.0,1.0,2890.0,True,0
783682,1036,4,2015-07-23,2015,7,30,4903,1,0,0,1,4.0,3.0,9560.0,True,1
186504,249,4,2015-07-23,2015,7,30,5163,1,0,0,1,4.0,3.0,18010.0,True,0


In [36]:
# import xgboost as xgb
dtrain = xgb.DMatrix(x_train[features], np.log(x_train['sales'] + 1))
dval = xgb.DMatrix(x_val[features], np.log(x_val['sales'] + 1))
dtest = xgb.DMatrix(test_data[features])

watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [38]:
def to_weight(y):
    w = np.zeros_like(y, dtype=float)
    ind = (y != 0)
    w[ind] = 1. / (y[ind] ** 2)
    return w

def rmspe(yhat, y):
    w = to_weight(y)
    return np.sqrt(np.mean(w * (y - yhat)**2)) 

def rmspe_xg(yhat, y):
    # with data after preprocessing
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = to_weight(y)
    return "rmspe", np.sqrt(np.mean(w * (y - yhat)**2)) 

In [39]:
params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, 
          "subsample": 0.7, "colsample_bytree": 0.7, "silent": 1}
num_trees = 300

gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg, verbose_eval=True)

[0]	eval-rmse:5.8687	train-rmse:5.79369	eval-rmspe:0.997058	train-rmspe:0.99679
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.1218	train-rmse:4.06437	eval-rmspe:0.982432	train-rmspe:0.981324
[2]	eval-rmse:2.90066	train-rmse:2.8562	eval-rmspe:0.940472	train-rmspe:0.937583
[3]	eval-rmse:2.04569	train-rmse:2.01474	eval-rmspe:0.860258	train-rmspe:0.855556
[4]	eval-rmse:1.46387	train-rmse:1.43177	eval-rmspe:0.750853	train-rmspe:0.742364
[5]	eval-rmse:1.05366	train-rmse:1.03029	eval-rmspe:0.627295	train-rmspe:0.618477
[6]	eval-rmse:0.773134	train-rmse:0.758647	eval-rmspe:0.511788	train-rmspe:0.505847
[7]	eval-rmse:0.58871	train-rmse:0.578839	eval-rmspe:0.421402	train-rmspe:0.419663
[8]	eval-rmse:0.468747	train-rmse:0.462183	eval-rmspe:0.358978	train-rmspe:0.362107
[9]	eval-rmse:0.392943	train-rmse:0.393421	eval-rmspe:0.323145	train-rmspe:0.33472
[10]	eval-rmse:0.350156	train-r

[96]	eval-rmse:0.160663	train-rmse:0.157084	eval-rmspe:0.165659	train-rmspe:0.193705
[97]	eval-rmse:0.160575	train-rmse:0.15664	eval-rmspe:0.165229	train-rmspe:0.193111
[98]	eval-rmse:0.160202	train-rmse:0.156283	eval-rmspe:0.164912	train-rmspe:0.192734
[99]	eval-rmse:0.159846	train-rmse:0.15609	eval-rmspe:0.164739	train-rmspe:0.192505
[100]	eval-rmse:0.159572	train-rmse:0.155755	eval-rmspe:0.164444	train-rmspe:0.192204
[101]	eval-rmse:0.15918	train-rmse:0.155282	eval-rmspe:0.163981	train-rmspe:0.19177
[102]	eval-rmse:0.158906	train-rmse:0.154959	eval-rmspe:0.163693	train-rmspe:0.19147
[103]	eval-rmse:0.158789	train-rmse:0.154812	eval-rmspe:0.163554	train-rmspe:0.191229
[104]	eval-rmse:0.158585	train-rmse:0.154609	eval-rmspe:0.163258	train-rmspe:0.191041
[105]	eval-rmse:0.157554	train-rmse:0.153661	eval-rmspe:0.162096	train-rmspe:0.190113
[106]	eval-rmse:0.157136	train-rmse:0.153206	eval-rmspe:0.161633	train-rmspe:0.18878
[107]	eval-rmse:0.156827	train-rmse:0.152892	eval-rmspe:0.161266

[192]	eval-rmse:0.139094	train-rmse:0.128088	eval-rmspe:0.141343	train-rmspe:0.167216
[193]	eval-rmse:0.138965	train-rmse:0.127907	eval-rmspe:0.14121	train-rmspe:0.166997
[194]	eval-rmse:0.138772	train-rmse:0.127642	eval-rmspe:0.140986	train-rmspe:0.166772
[195]	eval-rmse:0.13879	train-rmse:0.127519	eval-rmspe:0.140952	train-rmspe:0.166674
[196]	eval-rmse:0.138707	train-rmse:0.127412	eval-rmspe:0.140948	train-rmspe:0.166566
[197]	eval-rmse:0.13862	train-rmse:0.127235	eval-rmspe:0.140852	train-rmspe:0.166512
[198]	eval-rmse:0.138575	train-rmse:0.127109	eval-rmspe:0.140807	train-rmspe:0.166387
[199]	eval-rmse:0.138188	train-rmse:0.126786	eval-rmspe:0.140291	train-rmspe:0.166111
[200]	eval-rmse:0.13808	train-rmse:0.126603	eval-rmspe:0.140214	train-rmspe:0.166002
[201]	eval-rmse:0.13806	train-rmse:0.126445	eval-rmspe:0.140148	train-rmspe:0.165874
[202]	eval-rmse:0.137939	train-rmse:0.126347	eval-rmspe:0.140005	train-rmspe:0.165806
[203]	eval-rmse:0.137853	train-rmse:0.126244	eval-rmspe:0.1

[288]	eval-rmse:0.132512	train-rmse:0.116145	eval-rmspe:0.134047	train-rmspe:0.144238
[289]	eval-rmse:0.132474	train-rmse:0.11607	eval-rmspe:0.134007	train-rmspe:0.14432
[290]	eval-rmse:0.132413	train-rmse:0.116011	eval-rmspe:0.133931	train-rmspe:0.144223
[291]	eval-rmse:0.132471	train-rmse:0.115952	eval-rmspe:0.134055	train-rmspe:0.144103
[292]	eval-rmse:0.132388	train-rmse:0.115877	eval-rmspe:0.133945	train-rmspe:0.144042
[293]	eval-rmse:0.132379	train-rmse:0.115792	eval-rmspe:0.133884	train-rmspe:0.143986
[294]	eval-rmse:0.132332	train-rmse:0.1157	eval-rmspe:0.133834	train-rmspe:0.143936
[295]	eval-rmse:0.132281	train-rmse:0.115648	eval-rmspe:0.133763	train-rmspe:0.143879
[296]	eval-rmse:0.132313	train-rmse:0.115614	eval-rmspe:0.133779	train-rmspe:0.143907
[297]	eval-rmse:0.132232	train-rmse:0.11552	eval-rmspe:0.133718	train-rmspe:0.143815
[298]	eval-rmse:0.132136	train-rmse:0.11542	eval-rmspe:0.133611	train-rmspe:0.143732
[299]	eval-rmse:0.132099	train-rmse:0.115389	eval-rmspe:0.13

In [41]:
## validation
train_probs = gbm.predict(xgb.DMatrix(x_val[features]))
indices = train_probs < 0
train_probs[indices] = 0

error = rmspe(np.exp(train_probs) - 1, x_val['sales'].values)
print('error: ', error)

error:  0.13357803243515334


In [45]:
## testing
test_probs = gbm.predict(xgb.DMatrix(test_data[features]))
indices = test_probs < 0
test_probs[indices] = 0

submission = pd.DataFrame({'Id': test_data['id'], 'Sales': np.exp(test_probs) - 1})
submission.to_csv('xgboost_submission4.csv', index=False) ##0.12176

In [48]:
features = ['store', 'dayofweek', 'time_year', 'time_month', 'week_of_year',
            'promo', 'new_promo2', 'schoolholiday', 'storetype',
            'assortment', 'competitiondistance', 'is_competition']

dtrain = xgb.DMatrix(x_train[features], np.log(x_train['sales'] + 1))
dval = xgb.DMatrix(x_val[features], np.log(x_val['sales'] + 1))
dtest = xgb.DMatrix(test_data[features])

watchlist = [(dval, 'eval'), (dtrain, 'train')]

params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, 
          "subsample": 0.7, "colsample_bytree": 0.7, "silent": 1}
num_trees = 300

gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg, verbose_eval=True)

[0]	eval-rmse:5.86802	train-rmse:5.79378	eval-rmspe:0.997052	train-rmspe:0.996787
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.12096	train-rmse:4.06431	eval-rmspe:0.982449	train-rmspe:0.981318
[2]	eval-rmse:2.91233	train-rmse:2.8573	eval-rmspe:0.941067	train-rmspe:0.937367
[3]	eval-rmse:2.06112	train-rmse:2.0157	eval-rmspe:0.862424	train-rmspe:0.855418
[4]	eval-rmse:1.46889	train-rmse:1.43307	eval-rmspe:0.752158	train-rmspe:0.742313
[5]	eval-rmse:1.056	train-rmse:1.0335	eval-rmspe:0.627138	train-rmspe:0.618822
[6]	eval-rmse:0.779772	train-rmse:0.764373	eval-rmspe:0.512843	train-rmspe:0.507931
[7]	eval-rmse:0.592422	train-rmse:0.588992	eval-rmspe:0.421635	train-rmspe:0.426274
[8]	eval-rmse:0.477058	train-rmse:0.474959	eval-rmspe:0.363906	train-rmspe:0.375714
[9]	eval-rmse:0.404838	train-rmse:0.406568	eval-rmspe:0.332301	train-rmspe:0.353179
[10]	eval-rmse:0.357808	train-

[96]	eval-rmse:0.165813	train-rmse:0.163708	eval-rmspe:0.168914	train-rmspe:0.203347
[97]	eval-rmse:0.16528	train-rmse:0.162597	eval-rmspe:0.168264	train-rmspe:0.202303
[98]	eval-rmse:0.164603	train-rmse:0.162188	eval-rmspe:0.167749	train-rmspe:0.202212
[99]	eval-rmse:0.163897	train-rmse:0.161393	eval-rmspe:0.166348	train-rmspe:0.201375
[100]	eval-rmse:0.163065	train-rmse:0.161073	eval-rmspe:0.165952	train-rmspe:0.20101
[101]	eval-rmse:0.162614	train-rmse:0.160443	eval-rmspe:0.1654	train-rmspe:0.200421
[102]	eval-rmse:0.162334	train-rmse:0.16012	eval-rmspe:0.165122	train-rmspe:0.200132
[103]	eval-rmse:0.161668	train-rmse:0.159429	eval-rmspe:0.164441	train-rmspe:0.198959
[104]	eval-rmse:0.16104	train-rmse:0.159163	eval-rmspe:0.163948	train-rmspe:0.19885
[105]	eval-rmse:0.161214	train-rmse:0.158588	eval-rmspe:0.163489	train-rmspe:0.198465
[106]	eval-rmse:0.161215	train-rmse:0.158386	eval-rmspe:0.163491	train-rmspe:0.198289
[107]	eval-rmse:0.160785	train-rmse:0.157693	eval-rmspe:0.162978	

[192]	eval-rmse:0.136993	train-rmse:0.129556	eval-rmspe:0.138613	train-rmspe:0.168186
[193]	eval-rmse:0.136964	train-rmse:0.129515	eval-rmspe:0.138628	train-rmspe:0.168137
[194]	eval-rmse:0.136945	train-rmse:0.129428	eval-rmspe:0.138798	train-rmspe:0.168086
[195]	eval-rmse:0.136921	train-rmse:0.12924	eval-rmspe:0.138781	train-rmspe:0.167921
[196]	eval-rmse:0.136666	train-rmse:0.12879	eval-rmspe:0.138419	train-rmspe:0.167516
[197]	eval-rmse:0.136651	train-rmse:0.128647	eval-rmspe:0.1384	train-rmspe:0.16749
[198]	eval-rmse:0.136435	train-rmse:0.128428	eval-rmspe:0.138105	train-rmspe:0.167305
[199]	eval-rmse:0.136322	train-rmse:0.128287	eval-rmspe:0.137947	train-rmspe:0.167075
[200]	eval-rmse:0.136241	train-rmse:0.128043	eval-rmspe:0.137862	train-rmspe:0.166227
[201]	eval-rmse:0.136049	train-rmse:0.127903	eval-rmspe:0.137697	train-rmspe:0.166207
[202]	eval-rmse:0.135909	train-rmse:0.127703	eval-rmspe:0.137537	train-rmspe:0.166056
[203]	eval-rmse:0.135886	train-rmse:0.127675	eval-rmspe:0.1

[288]	eval-rmse:0.129326	train-rmse:0.117254	eval-rmspe:0.132173	train-rmspe:0.158146
[289]	eval-rmse:0.129062	train-rmse:0.117153	eval-rmspe:0.132019	train-rmspe:0.157997
[290]	eval-rmse:0.129396	train-rmse:0.117105	eval-rmspe:0.132278	train-rmspe:0.157979
[291]	eval-rmse:0.129319	train-rmse:0.117023	eval-rmspe:0.132178	train-rmspe:0.157864
[292]	eval-rmse:0.129323	train-rmse:0.116973	eval-rmspe:0.132206	train-rmspe:0.157877
[293]	eval-rmse:0.129283	train-rmse:0.116918	eval-rmspe:0.132163	train-rmspe:0.157657
[294]	eval-rmse:0.129298	train-rmse:0.116892	eval-rmspe:0.132201	train-rmspe:0.157609
[295]	eval-rmse:0.12929	train-rmse:0.11687	eval-rmspe:0.132194	train-rmspe:0.15759
[296]	eval-rmse:0.12947	train-rmse:0.116833	eval-rmspe:0.132251	train-rmspe:0.157464
[297]	eval-rmse:0.129386	train-rmse:0.116759	eval-rmspe:0.13214	train-rmspe:0.157416
[298]	eval-rmse:0.129652	train-rmse:0.116686	eval-rmspe:0.132468	train-rmspe:0.157325
[299]	eval-rmse:0.129628	train-rmse:0.116625	eval-rmspe:0.1

In [49]:
## validation
train_probs = gbm.predict(xgb.DMatrix(x_val[features]))
indices = train_probs < 0
train_probs[indices] = 0

error = rmspe(np.exp(train_probs) - 1, x_val['sales'].values)
print('error: ', error)

## testing
test_probs = gbm.predict(xgb.DMatrix(test_data[features]))
indices = test_probs < 0
test_probs[indices] = 0

submission = pd.DataFrame({'Id': test_data['id'], 'Sales': np.exp(test_probs) - 1})
submission.to_csv('xgboost_submission5.csv', index=False) ##0.12154

error:  0.13246705476841594


In [50]:
features = ['store', 'dayofweek', 'time_year', 'time_month', 'week_of_year',
            'promo', 'new_promo2', 'schoolholiday', 'storetype',
            'assortment', 'competitiondistance']

dtrain = xgb.DMatrix(x_train[features], np.log(x_train['sales'] + 1))
dval = xgb.DMatrix(x_val[features], np.log(x_val['sales'] + 1))
dtest = xgb.DMatrix(test_data[features])

watchlist = [(dval, 'eval'), (dtrain, 'train')]

params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, 
          "subsample": 0.7, "colsample_bytree": 0.7, "silent": 1}
num_trees = 300

gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg, verbose_eval=True)

[0]	eval-rmse:5.89268	train-rmse:5.79453	eval-rmspe:0.997116	train-rmspe:0.996776
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.14251	train-rmse:4.06458	eval-rmspe:0.982788	train-rmspe:0.981316
[2]	eval-rmse:2.92242	train-rmse:2.85647	eval-rmspe:0.941739	train-rmspe:0.937517
[3]	eval-rmse:2.06878	train-rmse:2.01483	eval-rmspe:0.863636	train-rmspe:0.855596
[4]	eval-rmse:1.47301	train-rmse:1.43084	eval-rmspe:0.754004	train-rmspe:0.742621
[5]	eval-rmse:1.06496	train-rmse:1.02961	eval-rmspe:0.631823	train-rmspe:0.618603
[6]	eval-rmse:0.774956	train-rmse:0.759223	eval-rmspe:0.512226	train-rmspe:0.506285
[7]	eval-rmse:0.587113	train-rmse:0.580136	eval-rmspe:0.419393	train-rmspe:0.420609
[8]	eval-rmse:0.466269	train-rmse:0.465142	eval-rmspe:0.356336	train-rmspe:0.366155
[9]	eval-rmse:0.392244	train-rmse:0.39633	eval-rmspe:0.322108	train-rmspe:0.340289
[10]	eval-rmse:0.34731	tra

[96]	eval-rmse:0.167326	train-rmse:0.161561	eval-rmspe:0.176906	train-rmspe:0.199283
[97]	eval-rmse:0.166627	train-rmse:0.160543	eval-rmspe:0.176051	train-rmspe:0.199631
[98]	eval-rmse:0.166513	train-rmse:0.16033	eval-rmspe:0.1759	train-rmspe:0.199452
[99]	eval-rmse:0.165924	train-rmse:0.159455	eval-rmspe:0.175161	train-rmspe:0.198528
[100]	eval-rmse:0.165039	train-rmse:0.158401	eval-rmspe:0.174026	train-rmspe:0.197575
[101]	eval-rmse:0.164471	train-rmse:0.157842	eval-rmspe:0.173106	train-rmspe:0.19701
[102]	eval-rmse:0.164172	train-rmse:0.157532	eval-rmspe:0.172662	train-rmspe:0.196704
[103]	eval-rmse:0.163848	train-rmse:0.157196	eval-rmspe:0.172259	train-rmspe:0.196408
[104]	eval-rmse:0.163321	train-rmse:0.156698	eval-rmspe:0.171717	train-rmspe:0.195158
[105]	eval-rmse:0.163491	train-rmse:0.155875	eval-rmspe:0.172477	train-rmspe:0.194191
[106]	eval-rmse:0.1631	train-rmse:0.155384	eval-rmspe:0.171954	train-rmspe:0.194161
[107]	eval-rmse:0.162774	train-rmse:0.155121	eval-rmspe:0.171409

[192]	eval-rmse:0.14299	train-rmse:0.130466	eval-rmspe:0.148988	train-rmspe:0.170767
[193]	eval-rmse:0.142676	train-rmse:0.130086	eval-rmspe:0.148635	train-rmspe:0.17049
[194]	eval-rmse:0.142618	train-rmse:0.129991	eval-rmspe:0.148576	train-rmspe:0.170328
[195]	eval-rmse:0.142554	train-rmse:0.129888	eval-rmspe:0.148512	train-rmspe:0.170249
[196]	eval-rmse:0.142537	train-rmse:0.129668	eval-rmspe:0.148665	train-rmspe:0.170153
[197]	eval-rmse:0.142397	train-rmse:0.129615	eval-rmspe:0.148536	train-rmspe:0.170021
[198]	eval-rmse:0.142323	train-rmse:0.129466	eval-rmspe:0.148428	train-rmspe:0.169916
[199]	eval-rmse:0.142298	train-rmse:0.129425	eval-rmspe:0.148373	train-rmspe:0.169912
[200]	eval-rmse:0.142196	train-rmse:0.129224	eval-rmspe:0.148244	train-rmspe:0.169714
[201]	eval-rmse:0.142166	train-rmse:0.129095	eval-rmspe:0.148208	train-rmspe:0.169591
[202]	eval-rmse:0.142149	train-rmse:0.129056	eval-rmspe:0.148213	train-rmspe:0.169546
[203]	eval-rmse:0.142057	train-rmse:0.128935	eval-rmspe:

[288]	eval-rmse:0.137116	train-rmse:0.118408	eval-rmspe:0.143284	train-rmspe:0.151485
[289]	eval-rmse:0.137037	train-rmse:0.118298	eval-rmspe:0.143159	train-rmspe:0.15142
[290]	eval-rmse:0.13697	train-rmse:0.1182	eval-rmspe:0.143072	train-rmspe:0.151331
[291]	eval-rmse:0.136944	train-rmse:0.118141	eval-rmspe:0.143085	train-rmspe:0.151198
[292]	eval-rmse:0.13679	train-rmse:0.118015	eval-rmspe:0.14288	train-rmspe:0.151098
[293]	eval-rmse:0.136761	train-rmse:0.117946	eval-rmspe:0.142846	train-rmspe:0.150974
[294]	eval-rmse:0.137056	train-rmse:0.117877	eval-rmspe:0.143014	train-rmspe:0.150939
[295]	eval-rmse:0.137014	train-rmse:0.117812	eval-rmspe:0.142967	train-rmspe:0.150682
[296]	eval-rmse:0.136846	train-rmse:0.11764	eval-rmspe:0.142802	train-rmspe:0.150485
[297]	eval-rmse:0.136779	train-rmse:0.117545	eval-rmspe:0.142773	train-rmspe:0.150406
[298]	eval-rmse:0.136754	train-rmse:0.117492	eval-rmspe:0.142753	train-rmspe:0.150361
[299]	eval-rmse:0.136811	train-rmse:0.117475	eval-rmspe:0.142

In [51]:
## validation
train_probs = gbm.predict(xgb.DMatrix(x_val[features]))
indices = train_probs < 0
train_probs[indices] = 0

error = rmspe(np.exp(train_probs) - 1, x_val['sales'].values)
print('error: ', error)

## testing
test_probs = gbm.predict(xgb.DMatrix(test_data[features]))
indices = test_probs < 0
test_probs[indices] = 0

submission = pd.DataFrame({'Id': test_data['id'], 'Sales': np.exp(test_probs) - 1})
submission.to_csv('xgboost_submission6.csv', index=False) ##0.12357

error:  0.14285933508594792


In [62]:
# train_data['competitiondistance_log'] = train_data['competitiondistance'].apply(lambda x : np.log(x+1))
test_data['competitiondistance_log'] = test_data['competitiondistance'].apply(lambda x : np.log(x+1))

In [60]:
x_train, x_val = train_test_split(train_data, test_size= 0.01, shuffle=False, random_state=0)

In [63]:
features = ['store', 'dayofweek', 'time_year', 'time_month', 'week_of_year',
            'promo', 'new_promo2', 'schoolholiday', 'storetype',
            'assortment', 'competitiondistance_log', 'is_competition']

dtrain = xgb.DMatrix(x_train[features], np.log(x_train['sales'] + 1))
dval = xgb.DMatrix(x_val[features], np.log(x_val['sales'] + 1))
dtest = xgb.DMatrix(test_data[features])

watchlist = [(dval, 'eval'), (dtrain, 'train')]

params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, 
          "subsample": 0.7, "colsample_bytree": 0.7, "silent": 1}
num_trees = 300

gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg, verbose_eval=True)

[0]	eval-rmse:5.86802	train-rmse:5.79378	eval-rmspe:0.997052	train-rmspe:0.996787
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.12096	train-rmse:4.06431	eval-rmspe:0.982449	train-rmspe:0.981318
[2]	eval-rmse:2.91233	train-rmse:2.8573	eval-rmspe:0.941067	train-rmspe:0.937367
[3]	eval-rmse:2.06112	train-rmse:2.0157	eval-rmspe:0.862424	train-rmspe:0.855418
[4]	eval-rmse:1.46889	train-rmse:1.43307	eval-rmspe:0.752158	train-rmspe:0.742313
[5]	eval-rmse:1.056	train-rmse:1.0335	eval-rmspe:0.627138	train-rmspe:0.618822
[6]	eval-rmse:0.779772	train-rmse:0.764373	eval-rmspe:0.512843	train-rmspe:0.507931
[7]	eval-rmse:0.592422	train-rmse:0.588992	eval-rmspe:0.421635	train-rmspe:0.426274
[8]	eval-rmse:0.477058	train-rmse:0.474959	eval-rmspe:0.363906	train-rmspe:0.375714
[9]	eval-rmse:0.404838	train-rmse:0.406568	eval-rmspe:0.332301	train-rmspe:0.353179
[10]	eval-rmse:0.357808	train-

[96]	eval-rmse:0.165813	train-rmse:0.163708	eval-rmspe:0.168914	train-rmspe:0.203347
[97]	eval-rmse:0.16528	train-rmse:0.162597	eval-rmspe:0.168264	train-rmspe:0.202303
[98]	eval-rmse:0.164603	train-rmse:0.162188	eval-rmspe:0.167749	train-rmspe:0.202212
[99]	eval-rmse:0.163897	train-rmse:0.161393	eval-rmspe:0.166348	train-rmspe:0.201375
[100]	eval-rmse:0.163065	train-rmse:0.161073	eval-rmspe:0.165952	train-rmspe:0.20101
[101]	eval-rmse:0.162614	train-rmse:0.160443	eval-rmspe:0.1654	train-rmspe:0.200421
[102]	eval-rmse:0.162334	train-rmse:0.16012	eval-rmspe:0.165122	train-rmspe:0.200132
[103]	eval-rmse:0.161668	train-rmse:0.159429	eval-rmspe:0.164441	train-rmspe:0.198959
[104]	eval-rmse:0.16104	train-rmse:0.159163	eval-rmspe:0.163948	train-rmspe:0.19885
[105]	eval-rmse:0.161214	train-rmse:0.158588	eval-rmspe:0.163489	train-rmspe:0.198465
[106]	eval-rmse:0.161215	train-rmse:0.158386	eval-rmspe:0.163491	train-rmspe:0.198289
[107]	eval-rmse:0.160785	train-rmse:0.157693	eval-rmspe:0.162978	

[192]	eval-rmse:0.136993	train-rmse:0.129556	eval-rmspe:0.138613	train-rmspe:0.168186
[193]	eval-rmse:0.136964	train-rmse:0.129515	eval-rmspe:0.138628	train-rmspe:0.168137
[194]	eval-rmse:0.136945	train-rmse:0.129428	eval-rmspe:0.138798	train-rmspe:0.168086
[195]	eval-rmse:0.136921	train-rmse:0.12924	eval-rmspe:0.138781	train-rmspe:0.167921
[196]	eval-rmse:0.136666	train-rmse:0.12879	eval-rmspe:0.138419	train-rmspe:0.167516
[197]	eval-rmse:0.136651	train-rmse:0.128647	eval-rmspe:0.1384	train-rmspe:0.16749
[198]	eval-rmse:0.136435	train-rmse:0.128428	eval-rmspe:0.138105	train-rmspe:0.167305
[199]	eval-rmse:0.136322	train-rmse:0.128287	eval-rmspe:0.137947	train-rmspe:0.167075
[200]	eval-rmse:0.136241	train-rmse:0.128043	eval-rmspe:0.137862	train-rmspe:0.166227
[201]	eval-rmse:0.136049	train-rmse:0.127903	eval-rmspe:0.137697	train-rmspe:0.166207
[202]	eval-rmse:0.135909	train-rmse:0.127703	eval-rmspe:0.137537	train-rmspe:0.166056
[203]	eval-rmse:0.135886	train-rmse:0.127675	eval-rmspe:0.1

[288]	eval-rmse:0.129326	train-rmse:0.117254	eval-rmspe:0.132173	train-rmspe:0.158146
[289]	eval-rmse:0.129062	train-rmse:0.117153	eval-rmspe:0.132019	train-rmspe:0.157997
[290]	eval-rmse:0.129396	train-rmse:0.117105	eval-rmspe:0.132278	train-rmspe:0.157979
[291]	eval-rmse:0.129319	train-rmse:0.117023	eval-rmspe:0.132178	train-rmspe:0.157864
[292]	eval-rmse:0.129323	train-rmse:0.116973	eval-rmspe:0.132206	train-rmspe:0.157877
[293]	eval-rmse:0.129283	train-rmse:0.116918	eval-rmspe:0.132163	train-rmspe:0.157657
[294]	eval-rmse:0.129298	train-rmse:0.116892	eval-rmspe:0.132201	train-rmspe:0.157609
[295]	eval-rmse:0.12929	train-rmse:0.11687	eval-rmspe:0.132194	train-rmspe:0.15759
[296]	eval-rmse:0.12947	train-rmse:0.116833	eval-rmspe:0.132251	train-rmspe:0.157464
[297]	eval-rmse:0.129386	train-rmse:0.116759	eval-rmspe:0.13214	train-rmspe:0.157416
[298]	eval-rmse:0.129652	train-rmse:0.116686	eval-rmspe:0.132468	train-rmspe:0.157325
[299]	eval-rmse:0.129628	train-rmse:0.116625	eval-rmspe:0.1

In [64]:
## validation
train_probs = gbm.predict(xgb.DMatrix(x_val[features]))
indices = train_probs < 0
train_probs[indices] = 0

error = rmspe(np.exp(train_probs) - 1, x_val['sales'].values)
print('error: ', error)

## testing
test_probs = gbm.predict(xgb.DMatrix(test_data[features]))
indices = test_probs < 0
test_probs[indices] = 0

submission = pd.DataFrame({'Id': test_data['id'], 'Sales': np.exp(test_probs) - 1})
submission.to_csv('xgboost_submission6.csv', index=False) ##0.12154

error:  0.13246705476841594


In [56]:
train_data[features].head()

Unnamed: 0,store,dayofweek,time_year,time_month,week_of_year,promo,new_promo2,schoolholiday,storetype,assortment,competitiondistance_log,is_competition
511577,676,2,2013,1,1,0,0,1,2.0,2.0,7.252054,True
554608,733,2,2013,1,1,0,0,1,2.0,2.0,6.758095,True
425970,562,2,2013,1,1,0,0,1,2.0,3.0,7.099202,True
374502,494,2,2013,1,1,0,0,1,2.0,1.0,7.13966,True
197575,262,2,2013,1,1,0,0,1,2.0,1.0,11.524666,False


In [59]:
train_data.columns

Index(['store', 'dayofweek', 'date', 'time_year', 'time_month', 'week_of_year',
       'sales', 'open', 'promo', 'stateholiday', 'schoolholiday', 'storetype',
       'assortment', 'competitiondistance', 'is_competition', 'new_promo2',
       'competitiondistance_log'],
      dtype='object')