In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import math
from sqlalchemy import create_engine
from datetime import datetime, timedelta

In [2]:
def to_weight(y):
    w = np.zeros_like(y, dtype=float)
    ind = (y != 0)
    w[ind] = 1. / (y[ind] ** 2)
    return w

In [80]:
def rmspe(yhat, y):
    w = to_weight(y)
    return np.sqrt(np.mean(w * (y - yhat)**2)) 

def rmspe_xg(yhat, y):
    # with data after preprocessing
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = to_weight(y)
    return "rmspe", np.sqrt(np.mean(w * (y - yhat)**2)) 

In [82]:
def build_features(features, data):
    # remove Nan: 
    # Nan is from store.csv : ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 
    #                          'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
    data.loc[data['Open'].isnull(), 'Open'] = 1 # thera are null in 'Open' of test.csv
    data.fillna(0, inplace=True)
    # use some features directly
    features += ['Store','CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 
                 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear']
    
    # add other features with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)
    
    # features.append('StateHoliday')
    # stateholiday_dict = {'0': 0, 'a': 1, 'b': 1, 'c': 1}
    # data['StateHoliday'] = data['StateHoliday'].astype(str).map(stateholiday_dict)
    
    features += ['DayOfWeek', 'month', 'day', 'year']
    data['year'] = data['Date'].apply(lambda x: x.split('-')[0]).astype(float)
    data['month'] = data['Date'].apply(lambda x: x.split('-')[1]).astype(float)
    data['day'] = data['Date'].apply(lambda x: x.split('-')[1]).astype(float)
    
    features.append('StoreType')
    storetype_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
    data['StoreType'] = data['StoreType'].map(storetype_dict).astype(float)
    
    features.append('Assortment')
    assortment_dict = {'a': 1, 'b': 2, 'c': 3}
    data['Assortment'] = data['Assortment'].map(assortment_dict).astype(float)
    return features, data

In [83]:
# loading data
train = pd.read_csv("input/train.csv", encoding='utf8')
test = pd.read_csv("input/test.csv", encoding='utf8')
store = pd.read_csv("input/store.csv", encoding='utf8')

  interactivity=interactivity, compiler=compiler, result=result)


In [84]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [85]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


In [86]:
train.shape

(1017209, 9)

In [87]:
# consider only data when the store is not open
train = train[train['Open'] != 0]
train.shape

(844392, 9)

In [88]:
# merge the data and the infromation about the stores
train = pd.merge(train, store, on='Store') # default : how = 'inner'
test = pd.merge(test, store, on='Store')

In [89]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


In [90]:
features_train, train = build_features([], train)

In [91]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
0,1,5,2015-07-31,5263,555,1,1,0,1.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,7.0,7.0
1,1,4,2015-07-30,5020,546,1,1,0,1.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,7.0,7.0
2,1,3,2015-07-29,4782,523,1,1,0,1.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,7.0,7.0
3,1,2,2015-07-28,5011,560,1,1,0,1.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,7.0,7.0
4,1,1,2015-07-27,6102,612,1,1,0,1.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,7.0,7.0


In [92]:
features_train

['Store',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'SchoolHoliday',
 'DayOfWeek',
 'month',
 'day',
 'year',
 'StoreType',
 'Assortment']

In [93]:
train[features].head()

Unnamed: 0,Store,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,SchoolHoliday,DayOfWeek,month,day,year,StoreType,Assortment
0,1,1270.0,9.0,2008.0,1,0,0.0,0.0,1.0,5,7.0,7.0,2015.0,3.0,1.0
1,1,1270.0,9.0,2008.0,1,0,0.0,0.0,1.0,4,7.0,7.0,2015.0,3.0,1.0
2,1,1270.0,9.0,2008.0,1,0,0.0,0.0,1.0,3,7.0,7.0,2015.0,3.0,1.0
3,1,1270.0,9.0,2008.0,1,0,0.0,0.0,1.0,2,7.0,7.0,2015.0,3.0,1.0
4,1,1270.0,9.0,2008.0,1,0,0.0,0.0,1.0,1,7.0,7.0,2015.0,3.0,1.0


In [94]:
features_test, test = build_features([], test)

In [95]:
params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, "subsample": 0.7, "colsample_bytree": 0.7, "silent": 1}
num_trees = 300

In [96]:
train.tail()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
844387,292,1,2013-01-07,9291,1002,1,1,0,0.0,1.0,...,1100.0,6.0,2009.0,0,0.0,0.0,0,2013.0,1.0,1.0
844388,292,6,2013-01-05,2748,340,1,0,0,0.0,1.0,...,1100.0,6.0,2009.0,0,0.0,0.0,0,2013.0,1.0,1.0
844389,292,5,2013-01-04,4202,560,1,0,0,1.0,1.0,...,1100.0,6.0,2009.0,0,0.0,0.0,0,2013.0,1.0,1.0
844390,292,4,2013-01-03,4580,662,1,0,0,1.0,1.0,...,1100.0,6.0,2009.0,0,0.0,0.0,0,2013.0,1.0,1.0
844391,292,3,2013-01-02,5076,672,1,0,0,1.0,1.0,...,1100.0,6.0,2009.0,0,0.0,0.0,0,2013.0,1.0,1.0


In [97]:
train.sort_values(by='Date', inplace=True)
train.tail()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
637225,844,5,2015-07-31,6444,583,1,1,0,1.0,1.0,...,2030.0,9.0,2012.0,1,18.0,2011.0,"Feb,May,Aug,Nov",2015.0,7.0,7.0
475845,630,5,2015-07-31,7544,801,1,1,0,1.0,1.0,...,1690.0,4.0,2015.0,0,0.0,0.0,0,2015.0,7.0,7.0
323314,429,5,2015-07-31,6999,523,1,1,0,0.0,4.0,...,16350.0,7.0,2005.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct",2015.0,7.0,7.0
173401,232,5,2015-07-31,5738,469,1,1,0,1.0,3.0,...,13570.0,5.0,2010.0,1,10.0,2013.0,"Mar,Jun,Sept,Dec",2015.0,7.0,7.0
0,1,5,2015-07-31,5263,555,1,1,0,1.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,7.0,7.0


In [98]:
train.shape

(844392, 21)

In [99]:
from sklearn.model_selection import train_test_split
val_size = 100000

x_train, x_val = train_test_split(train, test_size= 0.01, shuffle=False, random_state=0)

In [101]:
x_train.shape, x_val.shape

((835948, 21), (8444, 21))

In [100]:
x_val.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
613684,813,4,2015-07-23,5978,714,1,0,0,0.0,1.0,...,1560.0,9.0,2003.0,0,0.0,0.0,0,2015.0,7.0,7.0
60499,81,4,2015-07-23,6568,512,1,0,0,0.0,1.0,...,2370.0,3.0,2011.0,1,40.0,2014.0,"Jan,Apr,Jul,Oct",2015.0,7.0,7.0
146622,197,4,2015-07-23,5327,642,1,0,0,1.0,3.0,...,4210.0,3.0,2015.0,0,0.0,0.0,0,2015.0,7.0,7.0
93012,124,4,2015-07-23,3217,405,1,0,0,1.0,1.0,...,1410.0,4.0,2003.0,0,0.0,0.0,0,2015.0,7.0,7.0
324565,431,4,2015-07-23,8457,912,1,0,0,1.0,4.0,...,4520.0,0.0,0.0,0,0.0,0.0,0,2015.0,7.0,7.0


In [105]:
assert(features_train == features_test)

In [106]:
# import xgboost as xgb
dtrain = xgb.DMatrix(x_train[features_train], np.log(x_train['Sales'] + 1))
dval = xgb.DMatrix(x_val[features_train], np.log(x_val['Sales'] + 1))
dtest = xgb.DMatrix(test[features_test])

watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [107]:
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg, verbose_eval=True)

[0]	eval-rmse:5.86665	train-rmse:5.79323	eval-rmspe:0.997065	train-rmspe:0.996802
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.13727	train-rmse:4.06327	eval-rmspe:0.982822	train-rmspe:0.981425
[2]	eval-rmse:2.91546	train-rmse:2.85421	eval-rmspe:0.941755	train-rmspe:0.937899
[3]	eval-rmse:2.05612	train-rmse:2.01127	eval-rmspe:0.863051	train-rmspe:0.85639
[4]	eval-rmse:1.45822	train-rmse:1.42625	eval-rmspe:0.752181	train-rmspe:0.743702
[5]	eval-rmse:1.05148	train-rmse:1.02257	eval-rmspe:0.630087	train-rmspe:0.619513
[6]	eval-rmse:0.770778	train-rmse:0.748715	eval-rmspe:0.513869	train-rmspe:0.505639
[7]	eval-rmse:0.579244	train-rmse:0.566147	eval-rmspe:0.418708	train-rmspe:0.416996
[8]	eval-rmse:0.452209	train-rmse:0.44651	eval-rmspe:0.349584	train-rmspe:0.356129
[9]	eval-rmse:0.37348	train-rmse:0.372353	eval-rmspe:0.307218	train-rmspe:0.320775
[10]	eval-rmse:0.322949	trai

[96]	eval-rmse:0.152352	train-rmse:0.152398	eval-rmspe:0.152457	train-rmspe:0.199146
[97]	eval-rmse:0.152164	train-rmse:0.152136	eval-rmspe:0.152224	train-rmspe:0.19898
[98]	eval-rmse:0.151989	train-rmse:0.151961	eval-rmspe:0.152009	train-rmspe:0.198828
[99]	eval-rmse:0.151752	train-rmse:0.151848	eval-rmspe:0.151692	train-rmspe:0.198707
[100]	eval-rmse:0.151556	train-rmse:0.151615	eval-rmspe:0.151483	train-rmspe:0.20117
[101]	eval-rmse:0.15132	train-rmse:0.151394	eval-rmspe:0.151209	train-rmspe:0.200964
[102]	eval-rmse:0.151242	train-rmse:0.151278	eval-rmspe:0.151093	train-rmspe:0.199738
[103]	eval-rmse:0.15176	train-rmse:0.15065	eval-rmspe:0.151282	train-rmspe:0.199085
[104]	eval-rmse:0.152323	train-rmse:0.150431	eval-rmspe:0.15154	train-rmspe:0.198933
[105]	eval-rmse:0.152173	train-rmse:0.150263	eval-rmspe:0.151326	train-rmspe:0.198808
[106]	eval-rmse:0.151961	train-rmse:0.150024	eval-rmspe:0.151081	train-rmspe:0.198595
[107]	eval-rmse:0.151796	train-rmse:0.149766	eval-rmspe:0.150853

[192]	eval-rmse:0.144657	train-rmse:0.138017	eval-rmspe:0.141684	train-rmspe:0.183712
[193]	eval-rmse:0.144527	train-rmse:0.137873	eval-rmspe:0.141541	train-rmspe:0.184115
[194]	eval-rmse:0.144514	train-rmse:0.137769	eval-rmspe:0.141521	train-rmspe:0.184026
[195]	eval-rmse:0.144454	train-rmse:0.137575	eval-rmspe:0.141458	train-rmspe:0.183996
[196]	eval-rmse:0.144443	train-rmse:0.137486	eval-rmspe:0.14139	train-rmspe:0.183848
[197]	eval-rmse:0.144271	train-rmse:0.137366	eval-rmspe:0.141224	train-rmspe:0.183782
[198]	eval-rmse:0.144214	train-rmse:0.137346	eval-rmspe:0.141176	train-rmspe:0.183824
[199]	eval-rmse:0.144135	train-rmse:0.137315	eval-rmspe:0.141095	train-rmspe:0.183785
[200]	eval-rmse:0.14407	train-rmse:0.137195	eval-rmspe:0.141011	train-rmspe:0.184558
[201]	eval-rmse:0.144025	train-rmse:0.137119	eval-rmspe:0.140918	train-rmspe:0.184518
[202]	eval-rmse:0.144002	train-rmse:0.13705	eval-rmspe:0.140898	train-rmspe:0.184456
[203]	eval-rmse:0.14465	train-rmse:0.136995	eval-rmspe:0.

[288]	eval-rmse:0.143012	train-rmse:0.131976	eval-rmspe:0.13892	train-rmspe:0.173079
[289]	eval-rmse:0.143023	train-rmse:0.131962	eval-rmspe:0.138931	train-rmspe:0.173063
[290]	eval-rmse:0.143043	train-rmse:0.131933	eval-rmspe:0.138923	train-rmspe:0.172426
[291]	eval-rmse:0.142948	train-rmse:0.131863	eval-rmspe:0.138852	train-rmspe:0.172345
[292]	eval-rmse:0.143031	train-rmse:0.131806	eval-rmspe:0.138883	train-rmspe:0.172293
[293]	eval-rmse:0.142989	train-rmse:0.131755	eval-rmspe:0.138826	train-rmspe:0.172155
[294]	eval-rmse:0.142934	train-rmse:0.131722	eval-rmspe:0.138748	train-rmspe:0.171472
[295]	eval-rmse:0.142883	train-rmse:0.131725	eval-rmspe:0.138686	train-rmspe:0.171009
[296]	eval-rmse:0.142893	train-rmse:0.131718	eval-rmspe:0.138702	train-rmspe:0.17101
[297]	eval-rmse:0.142905	train-rmse:0.131705	eval-rmspe:0.1387	train-rmspe:0.170001
[298]	eval-rmse:0.142906	train-rmse:0.131668	eval-rmspe:0.138698	train-rmspe:0.169916
[299]	eval-rmse:0.142872	train-rmse:0.131602	eval-rmspe:0.

In [108]:
## validation
train_probs = gbm.predict(xgb.DMatrix(x_val[features_train]))
indices = train_probs < 0
train_probs[indices] = 0

error = rmspe(np.exp(train_probs) - 1, x_val['Sales'].values)
print('error: ', error)

error:  0.13867093648804893


In [109]:
## testing
test_probs = gbm.predict(xgb.DMatrix(test[features_test]))
indices = test_probs < 0
test_probs[indices] = 0

submission = pd.DataFrame({'Id': test['Id'], 'Sales': np.exp(test_probs) - 1})
submission.to_csv('xgboost_submission.csv', index=False) # 0.11335

In [110]:
test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
0,1,1,4,2015-09-17,1.0,1,0,0.0,3.0,1.0,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
1,857,1,3,2015-09-16,1.0,1,0,0.0,3.0,1.0,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
2,1713,1,2,2015-09-15,1.0,1,0,0.0,3.0,1.0,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
3,2569,1,1,2015-09-14,1.0,1,0,0.0,3.0,1.0,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
4,3425,1,7,2015-09-13,0.0,0,0,0.0,3.0,1.0,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0


In [111]:
submission.head()

Unnamed: 0,Id,Sales
0,1,4599.200195
1,857,4670.44873
2,1713,5209.955078
3,2569,5170.118164
4,3425,7184.00293


In [113]:
submission2 = pd.merge(submission, test, on='Id')
submission2.head()

Unnamed: 0,Id,Sales,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
0,1,4599.200195,1,4,2015-09-17,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
1,857,4670.44873,1,3,2015-09-16,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
2,1713,5209.955078,1,2,2015-09-15,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
3,2569,5170.118164,1,1,2015-09-14,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
4,3425,7184.00293,1,7,2015-09-13,0.0,0,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0


In [115]:
submission2['Sales'] = submission2['Sales'] * submission2['Open']

In [116]:
submission2.head()

Unnamed: 0,Id,Sales,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,year,month,day
0,1,4599.200195,1,4,2015-09-17,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
1,857,4670.44873,1,3,2015-09-16,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
2,1713,5209.955078,1,2,2015-09-15,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
3,2569,5170.118164,1,1,2015-09-14,1.0,1,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0
4,3425,0.0,1,7,2015-09-13,0.0,0,0,0.0,3.0,...,1270.0,9.0,2008.0,0,0.0,0.0,0,2015.0,9.0,9.0


In [117]:
submission2 = submission2.loc[:, ['Id', 'Sales']]
submission2.head()

Unnamed: 0,Id,Sales
0,1,4599.200195
1,857,4670.44873
2,1713,5209.955078
3,2569,5170.118164
4,3425,0.0


In [118]:
submission2.to_csv('xgboost_submission2.csv', index=False) #0.11335
## no differences

In [119]:
## remain data where open == 0
# loading data
train = pd.read_csv("input/train.csv", encoding='utf8')
test = pd.read_csv("input/test.csv", encoding='utf8')
store = pd.read_csv("input/store.csv", encoding='utf8')

# merge the data and the infromation about the stores
train = pd.merge(train, store, on='Store') # default : how = 'inner'
test = pd.merge(test, store, on='Store')

print(train.shape)
features_train, train = build_features([], train)
features_test, test = build_features([], test)

train.sort_values(by='Date', inplace=True)
x_train, x_val = train_test_split(train, test_size= 0.01, shuffle=False, random_state=0)
print(x_train.shape, x_val.shape)

assert(features_train == features_test)

dtrain = xgb.DMatrix(x_train[features_train], np.log(x_train['Sales'] + 1))
dval = xgb.DMatrix(x_val[features_train], np.log(x_val['Sales'] + 1))
dtest = xgb.DMatrix(test[features_test])

watchlist = [(dval, 'eval'), (dtrain, 'train')]

  interactivity=interactivity, compiler=compiler, result=result)


(1017209, 18)
(1007036, 21) (10173, 21)


In [120]:
params = {"objective": "reg:linear", "eta": 0.3, "max_depth": 8, 
          "subsample": 0.7, "colsample_bytree": 0.7, "silent": 1}
num_trees = 300

gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, 
                feval=rmspe_xg, verbose_eval=True)

[0]	eval-rmse:5.73385	train-rmse:5.70773	eval-rmspe:0.942067	train-rmspe:0.908774
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 50 rounds.
[1]	eval-rmse:4.31261	train-rmse:4.55345	eval-rmspe:0.931044	train-rmspe:0.900945
[2]	eval-rmse:3.5226	train-rmse:3.88392	eval-rmspe:0.907248	train-rmspe:0.884078
[3]	eval-rmse:2.47707	train-rmse:2.91635	eval-rmspe:0.841792	train-rmspe:0.837234
[4]	eval-rmse:1.79755	train-rmse:2.22158	eval-rmspe:0.748513	train-rmspe:0.763337
[5]	eval-rmse:1.32205	train-rmse:1.76911	eval-rmspe:0.636455	train-rmspe:0.670341
[6]	eval-rmse:1.15277	train-rmse:1.66711	eval-rmspe:0.544826	train-rmspe:0.608278
[7]	eval-rmse:0.893905	train-rmse:1.4413	eval-rmspe:0.445406	train-rmspe:0.523385
[8]	eval-rmse:0.733167	train-rmse:1.30154	eval-rmspe:0.377064	train-rmspe:0.458929
[9]	eval-rmse:0.643599	train-rmse:1.23879	eval-rmspe:0.339589	train-rmspe:0.422168
[10]	eval-rmse:0.627101	train-rm

[96]	eval-rmse:0.415528	train-rmse:0.947932	eval-rmspe:0.230435	train-rmspe:0.292555
[97]	eval-rmse:0.415231	train-rmse:0.947619	eval-rmspe:0.230153	train-rmspe:0.292447
[98]	eval-rmse:0.415252	train-rmse:0.947533	eval-rmspe:0.230315	train-rmspe:0.292583
[99]	eval-rmse:0.415073	train-rmse:0.947164	eval-rmspe:0.229855	train-rmspe:0.291941
[100]	eval-rmse:0.414469	train-rmse:0.946929	eval-rmspe:0.228845	train-rmspe:0.291711
[101]	eval-rmse:0.414171	train-rmse:0.946663	eval-rmspe:0.228633	train-rmspe:0.291238
[102]	eval-rmse:0.413958	train-rmse:0.946556	eval-rmspe:0.228152	train-rmspe:0.291155
[103]	eval-rmse:0.41387	train-rmse:0.946511	eval-rmspe:0.227962	train-rmspe:0.291085
[104]	eval-rmse:0.413785	train-rmse:0.946273	eval-rmspe:0.228034	train-rmspe:0.290993
[105]	eval-rmse:0.413452	train-rmse:0.946064	eval-rmspe:0.228306	train-rmspe:0.290629
[106]	eval-rmse:0.413705	train-rmse:0.945965	eval-rmspe:0.228924	train-rmspe:0.290752
[107]	eval-rmse:0.413657	train-rmse:0.945896	eval-rmspe:0.2

In [121]:
## validation
train_probs = gbm.predict(xgb.DMatrix(x_val[features_train]))
indices = train_probs < 0
train_probs[indices] = 0

error = rmspe(np.exp(train_probs) - 1, x_val['Sales'].values)
print('error: ', error)

error:  0.22382188627580968


In [122]:
## testing
test_probs = gbm.predict(xgb.DMatrix(test[features_test]))
indices = test_probs < 0
test_probs[indices] = 0

submission = pd.DataFrame({'Id': test['Id'], 'Sales': np.exp(test_probs) - 1})
submission.to_csv('xgboost_submission3.csv', index=False) # 0.23291