XGBoost benchmark

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in the input directory.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Directory that holds the competition files.
DATA_FOLDER = '../input'

# Read the five input tables in a fixed order and bind each one to its own name.
transactions, items, item_categories, shops, test = (
    pd.read_csv(os.path.join(DATA_FOLDER, csv_name))
    for csv_name in (
        'sales_train.csv.gz',
        'items.csv',
        'item_categories.csv',
        'shops.csv',
        'test.csv.gz',
    )
)

In [None]:
# Work on a copy: a plain `train_df = test` only creates a second name for the
# same frame, so the column added below would silently be written into `test`.
train_df = test.copy()

# Look the category up by item_id explicitly. Mapping through
# items['item_category_id'] directly matches item ids against the row index of
# `items`, which is only right while row i happens to describe item_id i.
# Assumes `items` has an `item_id` column with unique values — confirm against items.csv.
category_by_item = items.set_index('item_id')['item_category_id']
train_df['item_category_id'] = train_df['item_id'].map(category_by_item)

In [None]:
# Add one column per month block (0..33): the summed item_cnt_day of every
# (shop_id, item_id) pair in that month. Pairs without sales in a month get 0.
for month in range(34):
    month_sales = transactions.loc[transactions['date_block_num'] == month]
    target_df = (
        month_sales
        .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': month})
    )
    # Left merge keeps every row of train_df; fillna turns the missing months into 0.
    train_df = train_df.merge(target_df, how='left', on=['shop_id', 'item_id']).fillna(0.0)

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.figure(figsize=(15, 10))

# Column sums with the first four entries skipped (presumably the ID, shop,
# item and category columns, given how train_df was assembled).
monthly_totals = train_df.sum().iloc[4:]
plt.plot(monthly_totals)

# Vertical markers: green = start of a training window, red = end of one.
window_markers = [
    (33, 'r'),
    (33 - 9, 'g'),
    (33 - 12, 'r'),
    (33 - 21, 'g'),
    (33 - 24, 'r'),
    (0, 'g'),
]
for marker_x, marker_color in window_markers:
    plt.axvline(x=marker_x, color=marker_color)

We can clearly see some seasonality here. Each green line marks the START of a training window and each red line marks its END; the month to predict is the one right after the red line (red + 1).
Let's rebuild the DataFrame accordingly.

In [None]:
# Start again from a copy of the test pairs: a plain `train_df = test` only
# creates a second name for the same frame, so the column added below would
# silently be written into `test`.
train_df = test.copy()

# Look the category up by item_id explicitly. Mapping through
# items['item_category_id'] directly matches item ids against the row index of
# `items`, which is only right while row i happens to describe item_id i.
# Assumes `items` has an `item_id` column with unique values — confirm against items.csv.
category_by_item = items.set_index('item_id')['item_category_id']
train_df['item_category_id'] = train_df['item_id'].map(category_by_item)

In [None]:
windowsize = 9
k = 12  # first month block of this window

# Month blocks k .. k + windowsize + 1 become columns 0 .. windowsize + 1
# (0..10 here); column 10 is the one later used as the training label.
for month in range(k, k + windowsize + 2):
    month_sales = transactions.loc[transactions['date_block_num'] == month]
    target_df = (
        month_sales
        .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': month - k})
    )
    # Left merge keeps every row of train_df; fillna turns the missing months into 0.
    train_df = train_df.merge(target_df, how='left', on=['shop_id', 'item_id']).fillna(0.0)

In [None]:
# Preview the first five rows of the window built above.
train_df.head(n=5)

In [None]:
# Column sums with the first four entries skipped, so only the window columns are drawn.
window_totals = train_df.sum().iloc[4:]
plt.plot(window_totals)
plt.axvline(9, color='r')  # red marker at column 9
plt.axvline(0, color='g')  # green marker at column 0

In [None]:
# Second training window, again built from a copy of the test pairs: a plain
# `train_df2 = test` only creates a second name for the same frame, so the
# column added below would silently be written into `test`.
train_df2 = test.copy()

# Look the category up by item_id explicitly. Mapping through
# items['item_category_id'] directly matches item ids against the row index of
# `items`, which is only right while row i happens to describe item_id i.
# Assumes `items` has an `item_id` column with unique values — confirm against items.csv.
category_by_item = items.set_index('item_id')['item_category_id']
train_df2['item_category_id'] = train_df2['item_id'].map(category_by_item)

In [None]:
windowsize = 9
k = 0  # first month block of this window

# Month blocks k .. k + windowsize + 1 become columns 0 .. windowsize + 1
# (0..10 here); column 10 is the one later used as the training label.
for month in range(k, k + windowsize + 2):
    month_sales = transactions.loc[transactions['date_block_num'] == month]
    target_df = (
        month_sales
        .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': month - k})
    )
    # Left merge keeps every row of train_df2; fillna turns the missing months into 0.
    train_df2 = train_df2.merge(target_df, how='left', on=['shop_id', 'item_id']).fillna(0.0)

In [None]:
# Preview the first five rows of the second window.
train_df2.head(n=5)

In [None]:
# Column sums with the first four entries skipped, so only the window columns are drawn.
window_totals = train_df2.sum().iloc[4:]
plt.plot(window_totals)
plt.axvline(9, color='r')  # red marker at column 9
plt.axvline(0, color='g')  # green marker at column 0

In [None]:
# Stack the two training windows row-wise. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it both halves keep their own row labels
# and every label appears twice.
# NOTE: this cell rebuilds train_df from itself, so running it a second time
# appends train_df2 again — re-run the window cells first if you need to repeat it.
train_df = pd.concat([train_df, train_df2], ignore_index=True)

Here we have isolated the periods that precede each month to be predicted and combined them into one training set.

<h1>Modelling</h1>

In [None]:
import xgboost as xgb

In [None]:
# Features: every column except the row identifier 'ID' and column 10.
# Label: column 10 on its own (kept two-dimensional, one column wide).
is_feature_column = ~train_df.columns.isin(['ID', 10])
train_features = train_df.loc[:, is_feature_column].values
train_labels = train_df.loc[:, train_df.columns == 10].values
xgbtrain = xgb.DMatrix(train_features, train_labels)

In [None]:
# Settings handed to xgb.train as its parameter dictionary.
# NOTE(review): xgb.train has its own `num_boost_round` argument; the
# 'num_round' entry here is presumably not what sets the number of rounds — verify.
# NOTE(review): 'silent' may be deprecated in newer XGBoost releases — confirm
# against the installed version.
param = dict(
    max_depth=10,
    subsample=1,
    min_child_weight=0.5,
    eta=0.3,
    num_round=1000,
    seed=1,
    silent=0,
    eval_metric='rmse',
)

In [None]:
# xgb.train takes the number of boosting rounds through its own
# `num_boost_round` argument (default 10). A 'num_round' entry inside the
# parameter dictionary does not set it, so the configured value was never
# applied and only 10 rounds were trained.
# Read the intended round count from `param` and pass it explicitly; fall back
# to the library default when the key is absent.
num_boost_round = param.get('num_round', 10)

# Hand the booster only the settings it understands.
booster_params = {name: value for name, value in param.items() if name != 'num_round'}

# NOTE: with the configured round count this cell runs much longer than the
# previous 10-round fit.
bst = xgb.train(booster_params, xgbtrain, num_boost_round=num_boost_round)

In [None]:
# Draw the feature-importance chart of the trained booster.
xgb.plot_importance(booster=bst)

<h1>Predictions</h1>

In [None]:
# Build the prediction frame from a copy of the test pairs: a plain
# `predict_df = test` only creates a second name for the same frame, so the
# column added below would silently be written into `test`.
predict_df = test.copy()

# Look the category up by item_id explicitly. Mapping through
# items['item_category_id'] directly matches item ids against the row index of
# `items`, which is only right while row i happens to describe item_id i.
# Assumes `items` has an `item_id` column with unique values — confirm against items.csv.
category_by_item = items.set_index('item_id')['item_category_id']
predict_df['item_category_id'] = predict_df['item_id'].map(category_by_item)

In [None]:
windowsize = 9
k = 24  # first month block of the prediction window

# Month blocks k .. k + windowsize become columns 0 .. windowsize (0..9 here).
# Unlike the training windows there is no extra label column.
for month in range(k, k + windowsize + 1):
    month_sales = transactions.loc[transactions['date_block_num'] == month]
    target_df = (
        month_sales
        .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': month - k})
    )
    # Left merge keeps every row of predict_df; fillna turns the missing months into 0.
    predict_df = predict_df.merge(target_df, how='left', on=['shop_id', 'item_id']).fillna(0.0)

In [None]:
# Column sums with the first four entries skipped, so only the window columns are drawn.
window_totals = predict_df.sum().iloc[4:]
plt.plot(window_totals)
plt.axvline(9, color='r')  # red marker at column 9
plt.axvline(0, color='g')  # green marker at column 0

In [None]:
# Preview the first five rows of the prediction frame.
predict_df.head(n=5)

In [None]:
# Prediction features: every column of predict_df except the row identifier 'ID'.
is_feature_column = predict_df.columns != 'ID'
predict_features = predict_df.loc[:, is_feature_column].values
xgbpredict = xgb.DMatrix(predict_features)

In [None]:
# Score the prediction matrix with the trained booster.
pred = bst.predict(data=xgbpredict)

In [None]:
# Window column sums (first four entries skipped) followed by the total of all
# predictions as one extra point at the end of the curve.
observed_totals = predict_df.sum().iloc[4:].values
totals_with_forecast = np.append(observed_totals, pred.sum())
plt.plot(totals_with_forecast)
plt.axvline(9, color='r')  # red marker at column 9
plt.axvline(0, color='g')  # green marker at column 0

In [None]:
# Total of all predicted values, before clipping.
np.sum(pred)

In [None]:
# Limit every prediction to the range [0, 20].
pred = np.clip(pred, 0, 20)

In [None]:
# Submission table: the ID of each prediction row next to its clipped prediction.
sub_df = pd.DataFrame(
    {
        'ID': predict_df['ID'],
        'item_cnt_month': pred,
    }
)

In [None]:
# Summary statistics of the submission columns, as a quick sanity check before writing the file.
sub_df.describe()

In [None]:
# Write the submission to the working directory without the DataFrame index column.
sub_df.to_csv(path_or_buf='xg_boost4_cats.csv', index=False)