In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 

%matplotlib inline

In [111]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

test.set_index('ID', inplace=True)
item.drop(['item_name_translated'], axis=1, inplace=True)
shop.drop(['Name'], axis=1, inplace=True)

le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
shop['Ciyt'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

In [112]:
p_df = train.pivot_table(index=['shop_id', 'item_id'],
                        columns=['date_block_num'],
                        values='item_cnt_day',
                        aggfunc='sum').fillna(0.0)

In [113]:
u = test.join(p_df, on=['shop_id', 'item_id'])

cols = ['m{}'.format(n) if isinstance(n, (int)) else n for n in u.columns ]

u.columns = cols

u['m34'] = np.zeros(u.shape[0])

u = u.merge(item, how='left', on='item_id')
u = u.merge(shop, how='left', on='shop_id')

le = LabelEncoder()
u['item_cat1'] = le.fit_transform(u['item_cat1'])
u['item_cat2'] = le.fit_transform(u['item_cat2'].astype(str))
u['City'] = le.fit_transform(u['City'])
u['Type'] = le.fit_transform(u['Type'])

cols = u.columns.tolist()
cols.append('shop_id')
cols.append('item_id')
del cols[0]
del cols[0]

u = u[cols]

In [114]:
u.head()

Unnamed: 0,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,...,m32,m33,m34,item_cat1,item_cat2,City,Type,Ciyt,shop_id,item_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,7,28,19,4,22,5,5037
1,,,,,,,,,,,...,,,0.0,10,6,19,4,22,5,5320
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,7,28,19,4,22,5,5233
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7,42,19,4,22,5,5232
4,,,,,,,,,,,...,,,0.0,7,29,19,4,22,5,5268


In [109]:
v = train.merge(item)

In [110]:
v.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_cat1,item_cat2
0,02.01.2013,0,59,22154,999.0,1.0,Cinema,Blu-Ray
1,23.01.2013,0,24,22154,999.0,1.0,Cinema,Blu-Ray
2,20.01.2013,0,27,22154,999.0,1.0,Cinema,Blu-Ray
3,02.01.2013,0,25,22154,999.0,1.0,Cinema,Blu-Ray
4,03.01.2013,0,25,22154,999.0,1.0,Cinema,Blu-Ray


In [90]:
def clip_rmse(preds, dtrain):
    y_true = dtrain.get_label()
    preds = np.array(list(map(lambda x: min(20, max(x, 0)), list(preds))))
    rmse = np.sqrt(mean_squared_error(preds,y_test))
    return 'clip-rmse', rmse

In [101]:
m = 22 # for CV, m<=33
p = 21 # p<=m-1

col_x_train = ['m{}'.format(n) for n in list(np.arange(m-p-1, m-1))]
col_x_train.extend(['item_cat1', 'item_cat2', 'City', 
                    'Type', 'shop_id', 'item_id'])

col_x_test = ['m{}'.format(n) for n in list(np.arange(m-p, m))] 
col_x_test.extend(['item_cat1', 'item_cat2', 'City', 'Type', 'shop_id', 'item_id'])

x_train = u[col_x_train].values
y_train = u.loc[:, 'm{}'.format(m-1)].values
x_test = u[col_x_test].values
y_test = u.loc[:, 'm{}'.format(m)].values

param = {'max_depth':14, 
         'subsample':1,
         'min_child_weight':0.5,
         'eta':0.3, 
         'seed':1,
         'silent':0,
         'eval_metric':'rmse',
         'maximize': False}

progress = dict()
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
watchlist  = [(dtrain,'train'), (dtest, 'test')]

bst = xgb.train(param, dtrain, evals=watchlist,
               evals_result=progress,
               verbose_eval=True,
               num_boost_round=20)

preds = bst.predict(xgb.DMatrix(x_test))
rmse = np.sqrt(mean_squared_error(preds,y_test))
print(rmse)

[0]	train-rmse:2.36458	test-rmse:3.71616
[1]	train-rmse:1.86323	test-rmse:3.38712
[2]	train-rmse:1.50702	test-rmse:3.18421
[3]	train-rmse:1.25271	test-rmse:3.0547
[4]	train-rmse:1.06992	test-rmse:2.96865
[5]	train-rmse:0.947622	test-rmse:2.91552
[6]	train-rmse:0.854787	test-rmse:2.87538
[7]	train-rmse:0.785011	test-rmse:2.83455
[8]	train-rmse:0.734975	test-rmse:2.8177
[9]	train-rmse:0.691638	test-rmse:2.80153
[10]	train-rmse:0.658015	test-rmse:2.79206
[11]	train-rmse:0.632156	test-rmse:2.78417
[12]	train-rmse:0.614861	test-rmse:2.77905
[13]	train-rmse:0.60339	test-rmse:2.77514
[14]	train-rmse:0.592754	test-rmse:2.77213
[15]	train-rmse:0.584691	test-rmse:2.76978
[16]	train-rmse:0.580312	test-rmse:2.76818
[17]	train-rmse:0.560963	test-rmse:2.767
[18]	train-rmse:0.548344	test-rmse:2.76608
[19]	train-rmse:0.544969	test-rmse:2.76558
2.7655813772816105


In [86]:
preds = list(map(lambda x: min(20, max(x, 0)), list(preds)))
sub_df = pd.DataFrame({'ID': u.index, 'item_cnt_month':preds})
sub_df.to_csv('eda_7.csv', index=False)