In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 

%matplotlib inline

In [2]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

In [3]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


### `p_df`

In [4]:
p_df = train.pivot_table(index=['shop_id','item_id'], 
                            columns='date_block_num', 
                            values='item_cnt_day',
                            aggfunc='sum').fillna(0.0)

In [35]:
p_df = p_df.reset_index()

In [37]:
p_df = p_df.merge(item, how='inner', on='item_id')

In [39]:
p_df.drop(['item_name_translated'], axis=1, inplace=True)

In [41]:
le = LabelEncoder()
p_df['item_cat1'] = le.fit_transform(p_df['item_cat1'])
p_df['item_cat2'] = p_df['item_cat2'].astype(str)
p_df['item_cat2'] = le.fit_transform(p_df['item_cat2'])

### `p_df2`

In [46]:
train['value'] = train.item_price*train.item_cnt_day

In [47]:
p_df2 = train.pivot_table(index=['shop_id', 'item_id'], 
                         columns=['date_block_num'],
                         values=['item_cnt_day', 'value'],
                         aggfunc={'item_cnt_day': np.sum, 'value': np.sum}).fillna(0.0)

In [48]:
p_df2.columns = list(map(lambda x: str(x[0])+'_'+str(x[1]), 
                         zip(list(p_df2.columns.get_level_values(0)), 
                             list(p_df2.columns.get_level_values(1)))))

In [49]:
p_df2.reset_index(inplace=True)

In [50]:
p_df2 = p_df2.merge(item, how='inner', on='item_id')

In [52]:
p_df2.drop(['item_name_translated'], axis=1, inplace=True)

### Train model

In [54]:
import xgboost as xgb
param = {'max_depth':10, 
         'subsample':1,
         'min_child_weight':0.5,
         'eta':0.3, 
         'num_round':1000, 
         'seed':1,
         'silent':0,
         'eval_metric':'rmse'}

progress = dict()
xgbtrain = xgb.DMatrix(p_df.iloc[:,  (p_df.columns != 33)].values, p_df.iloc[:, p_df.columns == 33].values)
watchlist  = [(xgbtrain,'train'), (xgbtrain,'test')]

bst = xgb.train(param, xgbtrain, verbose_eval=True, evals=watchlist)
preds = bst.predict(xgb.DMatrix(p_df.iloc[:,  (p_df.columns != 33)].values))
rmse = np.sqrt(mean_squared_error(preds,p_df.iloc[:, p_df.columns == 33].values))
print(rmse)

[0]	train-rmse:3.40191	test-rmse:3.40191
[1]	train-rmse:2.91596	test-rmse:2.91596
[2]	train-rmse:2.52126	test-rmse:2.52126
[3]	train-rmse:2.19668	test-rmse:2.19668
[4]	train-rmse:1.93769	test-rmse:1.93769
[5]	train-rmse:1.72973	test-rmse:1.72973
[6]	train-rmse:1.56816	test-rmse:1.56816
[7]	train-rmse:1.43958	test-rmse:1.43958
[8]	train-rmse:1.34044	test-rmse:1.34044
[9]	train-rmse:1.2652	test-rmse:1.2652
1.2652050841170301


In [56]:
t_df = test.merge(p_df, how='left', on=['shop_id', 'item_id']).fillna(0.0)

In [57]:
t_df.head()

Unnamed: 0,ID,shop_id,item_id,0,1,2,3,4,5,6,...,26,27,28,29,30,31,32,33,item_cat1,item_cat2
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0,9.0,42.0
1,1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0,9.0,42.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,58.0
4,4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
cols = t_df.columns.tolist()
cols.insert(3, 'item_cat1')
cols.insert(4, 'item_cat2')
cols.pop()
cols.pop()

'item_cat1'

In [59]:
t_df = t_df[cols]

In [60]:
d = dict(zip(t_df.columns[5:],list(np.array(list(t_df.columns[5:])) - 1)))

In [61]:
t_df = t_df.rename(d, axis=1)

In [28]:
preds = bst.predict(xgb.DMatrix(t_df.iloc[:, (t_df.columns != 'ID') & (t_df.columns != -1)].values))

In [75]:
preds = list(map(lambda x: min(20,max(x,0)), list(preds)))
sub_df = pd.DataFrame({'ID':t_df.ID,'item_cnt_month': preds })
sub_df.describe()

Unnamed: 0,ID,item_cnt_month
count,214200.0,214200.0
mean,107099.5,1.346495
std,61834.358168,1.01497
min,0.0,0.0
25%,53549.75,0.848627
50%,107099.5,1.052436
75%,160649.25,2.146009
max,214199.0,20.0


In [18]:
sub_df.to_csv('xg_boost4_cats.csv',index=False)

In [64]:
def split_data(x, m):
    '''
    Split data into x_train, y_train, x_test, and y_test
    m is the number of months back, m<=0
    m=0 means train and predict for LB
    m=-1 means train on the first 32 months and test on the 33rd month
    m=-2 means train on the first 31 months and test on the 32nd month
    m=-3 means...
    '''
    
    x_train = x.iloc[:, :m-1].values
    y_train = x.iloc[:, m-1].values
    y_test = x.iloc[:, m].values
    x_test = x.iloc[:, :m]
    x_test.drop(-1, axis=1, inplace=True)
    x_test = x_test.values
    
    return x_train, y_train, x_test, y_test

def cross_val_predict_skf_rm_xgb(params, x, num_boost_round=3, n_repeats=2, random_state=3795264, verbose_eval=False):
    '''
    CV with repeated models
    verbose_eval is the same as in xgb.train
    '''
    cv_results = {}
    clfs = {}
    running_time = {}
    
    np.random.seed(random_state)
    
    for m in range(1, n_repeats+1):
        start_time = time.time()

        # split columns into x_train, y_train, x_test, and y_test
        x_train, y_train, x_test, y_test = split_data(x, -m)
        
        # Construct DMatrix
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dtest = xgb.DMatrix(x_test, label=y_test)

        # Placeholder for evals_result
        cv_results[m] = {}
        params['seed'] = np.random.randint(10**6)
        clfs[m] = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                            evals=[(dtrain, 'train'), (dtest, 'test')],
                            early_stopping_rounds=None, 
                            evals_result=cv_results[m], 
                            verbose_eval=verbose_eval)

        running_time[m] = time.time() - start_time
        
        print('Repeat {}, split {}, test RMSE = {:.3f}, running time = {:.3f} min'.format(m, 
            cv_results[m]['test']['rmse'][-1], running_time[m]/60))
        
    # Post-process cv_results
    cv_results_final = {}
    for m in range(n_repeats):
        cv_results_final['train', m] = cv_results[m]['train']['rmse']
        cv_results_final['test', m] = cv_results[m]['test']['rmse']
    
    df = pd.DataFrame.from_dict(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repeat', 'split']

    print('Score mean = {:.3f}, std = {:.3f}'.format(df['test'].iloc[-1].mean(), df['test'].iloc[-1].std()))
    
    return df, clfs, running_time


In [65]:
params = {'max_depth':10, 
         'subsample':1,
         'min_child_weight':0.5,
         'eta':0.3, 
         'num_round':1000, 
         'seed':1,
         'silent':0,
         'eval_metric':'rmse'}#,
#          'objective':'reg:linear'}

results = cross_val_predict_skf_rm_xgb(params, t_df, 
                                      num_boost_round=100, 
                                      n_repeats=2, 
                                      random_state=3795264, 
                                      verbose_eval=True)

[0]	train-rmse:2.02246	test-rmse:5.32084
[1]	train-rmse:1.68056	test-rmse:5.18676
[2]	train-rmse:1.43236	test-rmse:5.10703
[3]	train-rmse:1.28184	test-rmse:5.05734
[4]	train-rmse:1.15444	test-rmse:5.04191
[5]	train-rmse:1.07867	test-rmse:5.02215
[6]	train-rmse:1.02033	test-rmse:5.01034
[7]	train-rmse:0.984214	test-rmse:5.00742
[8]	train-rmse:0.951919	test-rmse:4.99778
[9]	train-rmse:0.935922	test-rmse:4.98894
[10]	train-rmse:0.912274	test-rmse:4.98806
[11]	train-rmse:0.89823	test-rmse:4.98816
[12]	train-rmse:0.879131	test-rmse:4.98323
[13]	train-rmse:0.874732	test-rmse:4.98062
[14]	train-rmse:0.859176	test-rmse:4.97911
[15]	train-rmse:0.851925	test-rmse:4.97821
[16]	train-rmse:0.81559	test-rmse:4.98264
[17]	train-rmse:0.801638	test-rmse:4.98434
[18]	train-rmse:0.786843	test-rmse:4.98588
[19]	train-rmse:0.783377	test-rmse:4.98563
[20]	train-rmse:0.78107	test-rmse:4.98521
[21]	train-rmse:0.769257	test-rmse:4.98663
[22]	train-rmse:0.768238	test-rmse:4.98758
[23]	train-rmse:0.763534	test-r