In [99]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [100]:
def rmspe_xg(yhat, y):
    """Evaluation callback returning the RMSPE of a set of predictions.

    ``yhat`` holds the predictions and ``y`` is an object exposing
    ``get_label()`` (``train_model`` passes this function as ``feval`` to
    ``xgb.train``). Both the labels and the predictions are passed through
    ``np.expm1`` before scoring, which undoes the ``np.log1p`` applied to
    the targets in ``train_model``.

    Returns:
        A ``(name, value)`` tuple: the string ``"rmspe"`` and the score.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

In [101]:
def rmspe(y, yhat):
    """Root mean squared percentage error of ``yhat`` relative to ``y``.

    Computes ``sqrt(mean((yhat / y - 1) ** 2))`` over the entries whose
    true value is non-zero. Entries with ``y == 0`` are skipped because the
    relative error is undefined there; dividing by them would turn the whole
    score into ``inf``/``nan``.

    Args:
        y: Array-like of true values.
        yhat: Array-like of predicted values, broadcastable against ``y``.

    Returns:
        The RMSPE as a float, or ``nan`` when no entry of ``y`` is non-zero.
    """
    y, yhat = np.broadcast_arrays(np.asarray(y), np.asarray(yhat))
    scoreable = y != 0
    if not scoreable.any():
        # Nothing to score: report NaN explicitly instead of dividing by zero.
        return float("nan")
    relative_error = yhat[scoreable] / y[scoreable] - 1
    return np.sqrt(np.mean(relative_error ** 2))

In [102]:
def build_features(features, data):
    """Engineer model features on ``data`` in place and record their names.

    Args:
        features: List that receives the names of the feature columns
            (mutated in place via ``extend``/``append``).
        data: DataFrame that must already contain the columns ``Store``,
            ``CompetitionDistance``, ``Promo``, ``Promo2``, ``SchoolHoliday``,
            ``StoreType``, ``Assortment``, ``StateHoliday``, ``Open``,
            ``Date`` (datetime), ``CompetitionOpenSinceYear``,
            ``CompetitionOpenSinceMonth``, ``Promo2SinceYear``,
            ``Promo2SinceWeek`` and ``PromoInterval``. It is modified in
            place.

    Returns:
        The same ``data`` object, with these columns added: ``Year``,
        ``Month``, ``Day``, ``DayOfWeek``, ``WeekOfYear``,
        ``CompetitionOpen``, ``PromoOpen``, ``IsPromoMonth`` and the helper
        column ``monthStr``.
    """
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Encode the single-character category codes as small integers.
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    coded_columns = ['StoreType', 'Assortment', 'StateHoliday']
    for column in coded_columns:
        # Assign the result back instead of calling replace(inplace=True) on
        # the attribute: a chained in-place update does not reach the frame
        # when pandas Copy-on-Write is active.
        data[column] = data[column].replace(mappings)

    # A missing ``Open`` flag defaults to 1. This has to run before the
    # blanket zero-fill below, otherwise the nulls are already gone and the
    # flag silently becomes 0.
    data.loc[data.Open.isnull(), 'Open'] = 1
    data.fillna(0, inplace=True)

    # Newer pandas no longer downcasts object columns inside replace(), so
    # make sure the encoded columns end up with a numeric dtype.
    for column in coded_columns:
        data[column] = data[column].infer_objects()

    # Calendar parts derived from the ``Date`` column.
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    dates = data.Date.dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    data['DayOfWeek'] = dates.dayofweek
    if hasattr(dates, 'isocalendar'):
        # ``.dt.weekofyear`` was removed from recent pandas releases.
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    else:
        data['WeekOfYear'] = dates.weekofyear

    # Where CompetitionOpenSinceYear / CompetitionOpenSinceMonth are missing
    # (zero after the fill above), use the mean of the known values.
    features.append('CompetitionOpen')
    for column in ('CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth'):
        known = data[column] != 0
        data.loc[~known, column] = data.loc[known, column].mean()
    data['CompetitionOpen'] = (
        12 * (data.Year - data.CompetitionOpenSinceYear)
        + (data.Month - data.CompetitionOpenSinceMonth)
    )

    # Time since Promo2 started, as 12 * year difference + week difference / 4;
    # forced to 0 where no start year is recorded.
    features.append('PromoOpen')
    data['PromoOpen'] = (
        12 * (data.Year - data.Promo2SinceYear)
        + (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    )
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Flag rows whose month abbreviation is listed in the row's PromoInterval.
    features.append('IsPromoMonth')
    # NOTE(review): September is spelled 'Sept' here; presumably this matches
    # the spelling used in PromoInterval -- confirm against the data.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data


In [103]:
def train_model(features, train):
    """Fit a gradient-boosted tree regressor on log-transformed sales.

    A small share of ``train`` (``test_size=0.012``) is held out as a
    validation set that is monitored during boosting. Targets are
    ``np.log1p(Sales)``, and ``rmspe_xg`` is supplied as the custom
    evaluation function.

    Args:
        features: Names of the columns of ``train`` used as model inputs.
        train: DataFrame containing those columns and a ``Sales`` column.

    Returns:
        The object returned by ``xgb.train``.
    """
    params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": 0.3,
        "max_depth": 10,
        "subsample": 0.9,
        "colsample_bytree": 0.7,
        "silent": 1,
        "seed": 1301,
    }
    rounds = 300

    print("Train a XGBoost model")

    # Fixed random_state keeps the train/validation split reproducible.
    fit_rows, holdout_rows = train_test_split(train, test_size=0.012, random_state=10)
    dtrain = xgb.DMatrix(fit_rows[features], np.log1p(fit_rows.Sales))
    dvalid = xgb.DMatrix(holdout_rows[features], np.log1p(holdout_rows.Sales))

    # The validation set is listed last in the watchlist.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    return xgb.train(
        params,
        dtrain,
        num_boost_round=rounds,
        evals=watchlist,
        early_stopping_rounds=100,
        feval=rmspe_xg,
        verbose_eval=True,
    )

In [104]:
# Load the training, test and store tables, with explicit dtypes for the
# train/test CSV reads.
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(float),
    PromoInterval=np.dtype(str),
)
train = pd.read_csv("./dataset/train.csv", dtype=types, parse_dates=[2])
test = pd.read_csv("./dataset/test.csv", dtype=types, parse_dates=[3])
store = pd.read_csv("./dataset/store.csv")

# Missing values in the train/test tables are filled with 1.
train = train.fillna(1)
test = test.fillna(1)

# Fit only on rows where the store was open and recorded non-zero sales.
train = train.loc[train["Open"].ne(0) & train['Sales'].ne(0)]

# Attach the per-store attributes to every row.
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')

# Only the training call collects the feature names; the test frame just
# needs the same engineered columns, so it gets a throwaway list.
features = []
build_features(features, train)
build_features([], test)
gbm = train_model(features, train)


Train a XGBoost model
[0]	train-rmse:5.79341	eval-rmse:5.79423	train-rmspe:0.996844	eval-rmspe:0.996849
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.06321	eval-rmse:4.0639	train-rmspe:0.981494	eval-rmspe:0.981516
[2]	train-rmse:2.85437	eval-rmse:2.85483	train-rmspe:0.937932	eval-rmspe:0.937958
[3]	train-rmse:2.01143	eval-rmse:2.012	train-rmspe:0.856279	eval-rmspe:0.856213
[4]	train-rmse:1.42532	eval-rmse:1.42591	train-rmspe:0.743553	eval-rmspe:0.742963
[5]	train-rmse:1.01953	eval-rmse:1.02008	train-rmspe:0.619224	eval-rmspe:0.617309
[6]	train-rmse:0.74052	eval-rmse:0.741247	train-rmspe:0.503908	eval-rmspe:0.499385
[7]	train-rmse:0.553154	eval-rmse:0.553897	train-rmspe:0.411826	eval-rmspe:0.403275
[8]	train-rmse:0.430237	eval-rmse:0.431113	train-rmspe:0.347982	eval-rmspe:0.334461
[9]	train-rmse:0.355562	eval-rmse:0.356691	train-rmspe:0.312717	eval-rmspe:0.294494
[10]	tra

[96]	train-rmse:0.104438	eval-rmse:0.113741	train-rmspe:0.157023	eval-rmspe:0.125427
[97]	train-rmse:0.104175	eval-rmse:0.11356	train-rmspe:0.156763	eval-rmspe:0.125247
[98]	train-rmse:0.103985	eval-rmse:0.113428	train-rmspe:0.156677	eval-rmspe:0.125095
[99]	train-rmse:0.103596	eval-rmse:0.113111	train-rmspe:0.156135	eval-rmspe:0.124769
[100]	train-rmse:0.10331	eval-rmse:0.112944	train-rmspe:0.154634	eval-rmspe:0.124586
[101]	train-rmse:0.103039	eval-rmse:0.112801	train-rmspe:0.154395	eval-rmspe:0.124441
[102]	train-rmse:0.102648	eval-rmse:0.112493	train-rmspe:0.154076	eval-rmspe:0.124097
[103]	train-rmse:0.101972	eval-rmse:0.111912	train-rmspe:0.146336	eval-rmspe:0.12349
[104]	train-rmse:0.10175	eval-rmse:0.111765	train-rmspe:0.146165	eval-rmspe:0.123336
[105]	train-rmse:0.101492	eval-rmse:0.11156	train-rmspe:0.1459	eval-rmspe:0.12312
[106]	train-rmse:0.101307	eval-rmse:0.11143	train-rmspe:0.145746	eval-rmspe:0.122955
[107]	train-rmse:0.100778	eval-rmse:0.111	train-rmspe:0.145231	eval

[192]	train-rmse:0.084197	eval-rmse:0.10032	train-rmspe:0.11044	eval-rmspe:0.111261
[193]	train-rmse:0.084092	eval-rmse:0.100296	train-rmspe:0.110351	eval-rmspe:0.111212
[194]	train-rmse:0.083866	eval-rmse:0.100156	train-rmspe:0.10733	eval-rmspe:0.111045
[195]	train-rmse:0.083734	eval-rmse:0.10009	train-rmspe:0.105284	eval-rmspe:0.11099
[196]	train-rmse:0.083548	eval-rmse:0.099955	train-rmspe:0.105125	eval-rmspe:0.11087
[197]	train-rmse:0.08345	eval-rmse:0.099938	train-rmspe:0.104707	eval-rmspe:0.110838
[198]	train-rmse:0.083286	eval-rmse:0.09985	train-rmspe:0.104558	eval-rmspe:0.110755
[199]	train-rmse:0.083177	eval-rmse:0.099792	train-rmspe:0.104459	eval-rmspe:0.110704
[200]	train-rmse:0.083067	eval-rmse:0.099768	train-rmspe:0.104394	eval-rmspe:0.110681
[201]	train-rmse:0.083003	eval-rmse:0.099743	train-rmspe:0.104298	eval-rmspe:0.110655
[202]	train-rmse:0.082885	eval-rmse:0.099694	train-rmspe:0.103741	eval-rmspe:0.110603
[203]	train-rmse:0.08284	eval-rmse:0.099676	train-rmspe:0.1037

[288]	train-rmse:0.074512	eval-rmse:0.096244	train-rmspe:0.081277	eval-rmspe:0.107127
[289]	train-rmse:0.074418	eval-rmse:0.096197	train-rmspe:0.08118	eval-rmspe:0.107075
[290]	train-rmse:0.074374	eval-rmse:0.0962	train-rmspe:0.081129	eval-rmspe:0.10707
[291]	train-rmse:0.074285	eval-rmse:0.096179	train-rmspe:0.081026	eval-rmspe:0.107048
[292]	train-rmse:0.074186	eval-rmse:0.096145	train-rmspe:0.080853	eval-rmspe:0.107031
[293]	train-rmse:0.074122	eval-rmse:0.096138	train-rmspe:0.080786	eval-rmspe:0.107017
[294]	train-rmse:0.074036	eval-rmse:0.096109	train-rmspe:0.080666	eval-rmspe:0.106976
[295]	train-rmse:0.073964	eval-rmse:0.096107	train-rmspe:0.080578	eval-rmspe:0.106979
[296]	train-rmse:0.073904	eval-rmse:0.096101	train-rmspe:0.080516	eval-rmspe:0.106968
[297]	train-rmse:0.073852	eval-rmse:0.096093	train-rmspe:0.080462	eval-rmspe:0.106978
[298]	train-rmse:0.073808	eval-rmse:0.096084	train-rmspe:0.080418	eval-rmspe:0.106967
[299]	train-rmse:0.073777	eval-rmse:0.096072	train-rmspe:0

In [105]:
# Score the test rows and write the submission file. Predictions are on the
# log1p scale used for the training targets, so expm1 maps them back.
# NOTE(review): training dropped rows with Open == 0 and 'Open' is not a
# feature, so test rows for closed stores (if any) still get a non-zero
# prediction here -- confirm whether they should be forced to 0.
dtest = xgb.DMatrix(test.loc[:, features])
test_probs = gbm.predict(dtest)
result = test[["Id"]].assign(Sales=np.expm1(test_probs))
result.to_csv("xgboost_10_submission.csv", index=False)

In [106]:
# [299]	train-rmse:0.075718	eval-rmse:0.092818	train-rmspe:0.0844	eval-rmspe:0.097642
# train-rmse:0.0745	eval-rmse:0.091301	train-rmspe:0.080346	eval-rmspe:0.09595