In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name == 'nt':
    # On Windows, put the MinGW-w64 bin directory at the front of PATH before
    # xgboost is imported below.
    # NOTE(review): presumably needed so xgboost's native library can locate
    # the MinGW runtime DLLs -- confirm this is still required.
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # Read PATH defensively: a missing PATH must not abort (or silently skip)
    # the setup, which is what the former bare `except: pass` did.
    current_path = os.environ.get('PATH', '')
    os.environ['PATH'] = (mingw_path + os.pathsep + current_path) if current_path else mingw_path
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from numba import jit

%matplotlib inline

In [2]:
# Load data
# All paths are relative to the notebook's working directory.
train = pd.read_csv('all/sales_train.csv.gz')
# Index the test rows by their submission ID.
test = pd.read_csv('all/test.csv.gz').set_index('ID')
# The name columns are not used as features anywhere below, so drop them up front.
shop = pd.read_csv('all/shops-translated.csv').drop(['Name'], axis=1)
item = pd.read_csv('all/item_category.csv').drop(['item_name_translated'], axis=1)

# Integer-encode the categorical columns. fit_transform refits the encoder on
# every call, so sharing one LabelEncoder instance across columns is safe.
le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
# Fixed typo: this used to assign to 'Ciyt', which added a stray column and
# left 'City' unencoded.
shop['City'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

In [10]:
# Wide sales table: one row per (shop_id, item_id), one column per
# date_block_num, each cell holding the summed item_cnt_day for that block.
p_df = train.pivot_table(
    values='item_cnt_day',
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    aggfunc='sum',
)
# Combinations with no recorded rows in a block come out as NaN; store them as 0.
p_df = p_df.fillna(0.0)

In [11]:
# Attach the per-block sales history to every test (shop_id, item_id) pair.
# Pairs that have no row in p_df get an all-zero history.
u = test.join(p_df, on=['shop_id', 'item_id']).fillna(0.0)

# The pivoted columns are labelled with integers; rename them to 'm<N>'.
# Accept numpy integer labels as well as plain Python ints.
u.columns = ['m{}'.format(n) if isinstance(n, (int, np.integer)) else n
             for n in u.columns]

# All-zero placeholder column so that m = 34 can be selected downstream.
u['m34'] = 0.0

# DataFrame.merge on columns discards the left frame's index, so carry the
# test ID through the merges as a column and restore it afterwards. The
# submission cell relies on u.index holding the test IDs.
u = (
    u.reset_index()
     .merge(item, how='left', on='item_id')
     .merge(shop, how='left', on='shop_id')
     .set_index('ID')
)

# Re-encode the categorical columns on the merged frame.
le = LabelEncoder()
u['item_cat1'] = le.fit_transform(u['item_cat1'])
u['item_cat2'] = le.fit_transform(u['item_cat2'].astype(str))
u['City'] = le.fit_transform(u['City'])
u['Type'] = le.fit_transform(u['Type'])

# Move the two key columns to the end. They are selected by name rather than
# by position, so the reorder no longer depends on test's column order.
key_cols = ['shop_id', 'item_id']
cols = [c for c in u.columns.tolist() if c not in key_cols] + key_cols

u = u[cols]

In [30]:
def clip_rmse(preds, dtrain, lower=0.0, upper=20.0):
    """Custom xgboost evaluation metric: RMSE after clipping to [lower, upper].

    Both the predictions and the labels are clipped before the error is
    computed. The default bounds reproduce the previously hard-coded
    [0, 20] range.

    Parameters
    ----------
    preds : array-like
        Predicted values, one per row of ``dtrain``.
    dtrain : object with a ``get_label()`` method
        Evaluation data; xgboost passes its DMatrix here.
    lower, upper : float, optional
        Clipping bounds applied to predictions and labels alike.

    Returns
    -------
    tuple of (str, float)
        The metric name ``'clip-rmse'`` and its value.
    """
    labels = np.clip(np.asarray(dtrain.get_label(), dtype=float), lower, upper)
    clipped_preds = np.clip(np.asarray(preds, dtype=float), lower, upper)
    rmse = float(np.sqrt(np.mean(np.square(clipped_preds - labels))))
    return 'clip-rmse', rmse

In [31]:
m = 33  # column 'm<m>' is the held-out target; for CV, m<=33
p = 32  # number of consecutive monthly columns used as features; p<=m-1

# Non-temporal features shared by the train and test design matrices.
static_cols = ['item_cat1', 'item_cat2', 'City', 'Type', 'shop_id', 'item_id']

# Train window: columns m-p-1 .. m-2 predict column m-1.
col_x_train = ['m{}'.format(n) for n in range(m - p - 1, m - 1)] + static_cols

# Test window: the same window shifted forward by one, columns m-p .. m-1
# predict column m.
col_x_test = ['m{}'.format(n) for n in range(m - p, m)] + static_cols

x_train = u[col_x_train].values
y_train = u.loc[:, 'm{}'.format(m - 1)].values
x_test = u[col_x_test].values
# NOTE(review): with m = 34 this is the all-zero placeholder column, so the
# 'test' metrics below would not be meaningful -- confirm intended usage.
y_test = u.loc[:, 'm{}'.format(m)].values

# Booster parameters only. 'maximize' is an argument of xgb.train, not a
# booster parameter, so it is passed to xgb.train below instead.
# NOTE(review): 'silent' is deprecated in favour of 'verbosity' in newer
# xgboost releases -- confirm against the installed version.
param = {'max_depth': 14,
         'subsample': 1,
         'min_child_weight': 0.5,
         'eta': 0.3,
         'seed': 1,
         'silent': 0,
         'eval_metric': 'rmse'}

progress = dict()
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
watchlist = [(dtrain, 'train'), (dtest, 'test')]

bst = xgb.train(param, dtrain,
                num_boost_round=5,
                evals=watchlist,
                evals_result=progress,
                verbose_eval=True,
                feval=clip_rmse,
                maximize=False)

# Unclipped RMSE on the held-out column; the clipped variant is reported per
# round through clip_rmse above.
preds = bst.predict(xgb.DMatrix(x_test))
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(rmse)

[0.71887755 0.38705453 0.46932155 ... 0.36601526 0.36601526 0.39828765]
[1. 0. 3. ... 0. 0. 0.]
[0.61410713 0.38705453 0.71887755 ... 0.39824027 0.36601526 0.35543844]
[0. 0. 1. ... 0. 0. 0.]
[0]	train-rmse:1.9713	test-rmse:5.33783	train-clip-rmse:0.892926	test-clip-rmse:0.988395
[0.8185612  0.29153898 1.13252    ... 0.2866662  0.26961002 0.30205122]
[1. 0. 3. ... 0. 0. 0.]
[0.7486584  0.29153898 0.9125807  ... 0.31889123 0.26961002 0.259202  ]
[0. 0. 1. ... 0. 0. 0.]
[1]	train-rmse:1.60052	test-rmse:5.20713	train-clip-rmse:0.763827	test-clip-rmse:0.951249
[0.91566676 0.23168734 1.0896486  ... 0.23318593 0.20129761 0.23010217]
[1. 0. 3. ... 0. 0. 0.]


KeyboardInterrupt: 

In [32]:
# Sanity check: np.minimum with a scalar caps every element at that scalar.
a = np.arange(6)
np.minimum(a, 3)

array([0, 1, 2, 3, 3, 3])

In [86]:
# Clip the predictions to [0, 20] in one vectorised call instead of a
# per-element Python lambda.
# NOTE(review): `preds` comes from the training cell, whose checked-in setting
# is m = 33 (the CV configuration) -- confirm that cell was re-run with the
# intended forecast month before writing a submission.
preds = np.clip(preds, 0, 20)
sub_df = pd.DataFrame({'ID': u.index, 'item_cnt_month': preds})
sub_df.to_csv('eda_7.csv', index=False)