In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

def prepend_to_path(directory, env=None):
    """Put ``directory`` at the front of the ``PATH`` entry of ``env``.

    Parameters
    ----------
    directory : str
        Directory to add to the search path.
    env : mutable mapping, optional
        Environment mapping to update; defaults to ``os.environ``.

    Returns
    -------
    str
        The new ``PATH`` value stored in ``env``.
    """
    if env is None:
        env = os.environ
    current = env.get('PATH', '')
    # A missing or empty PATH must neither raise nor leave a dangling separator.
    env['PATH'] = directory + os.pathsep + current if current else directory
    return env['PATH']


if os.name == 'nt':
    # Windows only: expose the MinGW-w64 bin directory on PATH before xgboost is
    # imported below (presumably so its runtime DLLs can be found - confirm on
    # the target machine).
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    prepend_to_path(mingw_path)
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 

%matplotlib inline

In [2]:
# Load data: four CSV tables, read from paths relative to the working directory.
data_files = {
    'train': 'all/sales_train.csv.gz',    # daily sales rows (pivoted by month below)
    'test': 'all/test.csv.gz',
    'shop': 'all/shops-translated.csv',   # per-shop Name / City / Type
    'item': 'all/item_category.csv',      # per-item translated name and two category columns
}
train = pd.read_csv(data_files['train'])
test = pd.read_csv(data_files['test'])
shop = pd.read_csv(data_files['shop'])
item = pd.read_csv(data_files['item'])

In [3]:
# One row per (shop_id, item_id) pair, one column per date_block_num, each cell
# holding the summed item_cnt_day for that pair and block.
x = pd.pivot_table(
    train,
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    values='item_cnt_day',
    aggfunc='sum',
)
# Pairs with no rows in a block come out as NaN; treat them as zero sales.
x = x.fillna(0.0)

In [4]:
# Turn the (shop_id, item_id) index levels into ordinary columns.
x = x.reset_index(drop=False)

In [5]:
# Attach the item table to every row, then discard the free-text item name.
x = pd.merge(x, item, how='inner', on='item_id')
x = x.drop(['item_name_translated'], axis=1)

# Replace both category columns with integer codes. The same encoder object is
# refitted for each column, so only the most recent mapping remains in `le`.
le = LabelEncoder()
x['item_cat1'] = le.fit_transform(x['item_cat1'])
# item_cat2 is cast to str before encoding.
x['item_cat2'] = le.fit_transform(x['item_cat2'].astype(str))

In [6]:
# Move the last two columns (the category codes added by the merge) to
# positions 2 and 3, directly after the first two columns.
cols = list(x.columns)
cat_cols = cols[-2:]
cols = cols[:2] + cat_cols + cols[2:-2]

x = x[cols]

In [7]:
# Attach the shop table's columns to every row via shop_id.
x = pd.merge(x, shop, how='inner', on='shop_id')

In [8]:
# The shop name is not kept as a feature.
x = x.drop(['Name'], axis=1)

In [9]:
# Integer-encode the two shop attributes, refitting the shared encoder for each.
for shop_col in ('City', 'Type'):
    x[shop_col] = le.fit_transform(x[shop_col])

In [10]:
# Build the new column order: put 'City' and 'Type' at positions 4 and 5, then
# remove the original trailing copies of those two labels.
cols = list(x.columns)
cols.insert(4, 'City')
cols.insert(5, 'Type')
cols.pop()  # trailing 'Type'
cols.pop()  # trailing 'City' (echoed as the cell output)

'City'

In [11]:
# Apply the column order prepared in `cols`.
x = x.loc[:, cols]

In [12]:
# Preview the first rows of x to check the column order after the reordering.
x.head()

Unnamed: 0,shop_id,item_id,item_cat1,item_cat2,City,Type,0,1,2,3,...,24,25,26,27,28,29,30,31,32,33
0,0,30,3,23,26,3,0.0,31.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,31,3,9,26,3,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,32,3,23,26,3,6.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33,3,9,26,3,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,35,3,23,26,3,1.0,14.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Positional index of the column used as y_test below; position m-1 is used
# as y_train. With -1 these are the last and second-to-last columns of x.
m = -1

In [14]:
# Training features: every column of x before position m-1 (with m = -1,
# everything except the last two columns), as a NumPy array.
x_train = x.iloc[:, :m-1].values

In [15]:
# Training target: the column at position m-1 (second to last when m = -1).
y_train = x.iloc[:, m-1].values

In [16]:
# Evaluation features: every column of x before position m, minus the column
# labelled 0 (the first monthly column), which keeps the matrix as wide as
# x_train. drop() is used without inplace=True so the iloc slice of x is never
# modified in place; that pattern raises SettingWithCopyWarning and relies on
# view-versus-copy behaviour.
x_test = x.iloc[:, :m].drop(0, axis=1).values

In [17]:
# Evaluation target: the column at position m (the last column when m = -1).
y_test = x.iloc[:, m].values

In [18]:
# Booster settings for the hold-out check. 'maximize' is no longer listed here:
# it is an argument of xgb.train (only consulted together with early stopping),
# not a booster parameter, so it had no effect inside this dict.
param = {'max_depth': 10,
         'subsample': 1,
         'min_child_weight': 0.5,
         'eta': 0.3,
         'seed': 1,
         'silent': 0,
         'eval_metric': 'rmse'}

# Filled by xgb.train with the per-round metric of every watchlist entry.
progress = dict()
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
watchlist = [(dtrain, 'train'), (dtest, 'test')]

# No num_boost_round is passed, so the library default applies
# (the recorded log shows rounds 0-9).
# NOTE(review): in that log test-rmse climbs from 4.15 to 7.95 while train-rmse
# keeps falling - consider early_stopping_rounds or stronger regularisation.
bst = xgb.train(param, dtrain, evals=watchlist,
                evals_result=progress,
                verbose_eval=True)

# Score the held-out column. dtest is reused instead of building a second
# DMatrix, and the arguments follow scikit-learn's (y_true, y_pred) order;
# squared error is symmetric, so the value itself is unchanged.
preds = bst.predict(dtest)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(rmse)

[0]	train-rmse:4.09632	test-rmse:4.14596
[1]	train-rmse:3.453	test-rmse:4.49112
[2]	train-rmse:2.93071	test-rmse:4.96605
[3]	train-rmse:2.50118	test-rmse:5.46545
[4]	train-rmse:2.15515	test-rmse:5.93497
[5]	train-rmse:1.83219	test-rmse:6.46481
[6]	train-rmse:1.5583	test-rmse:6.92911
[7]	train-rmse:1.3606	test-rmse:7.32447
[8]	train-rmse:1.2081	test-rmse:7.66303
[9]	train-rmse:1.0915	test-rmse:7.95024
7.950239584511172


### Test 

In [21]:
# Rebuild the shop/item x month table, this time keeping (shop_id, item_id) as
# the index: summed item_cnt_day per date_block_num.
p_df = pd.pivot_table(
    train,
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    values='item_cnt_day',
    aggfunc='sum',
)
# Pairs with no rows in a block come out as NaN; treat them as zero sales.
p_df = p_df.fillna(0.0)

In [22]:
# Key the item table by item_id and discard the free-text name, leaving only
# the columns that get joined onto p_df.
item = item.set_index('item_id').drop(['item_name_translated'], axis=1)

In [23]:
# Add item's columns to p_df by matching p_df's 'item_id' key against item's
# index (item was re-indexed by item_id in the previous cell).
p_df = p_df.join(item, on='item_id')

In [24]:
# Key the shop table by shop_id and discard the shop name, then preview what
# is left.
shop = shop.set_index('shop_id').drop(['Name'], axis=1)
shop.head()

Unnamed: 0_level_0,City,Type
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Yakutsk,Shop
1,Yakutsk,TC
2,Adygea,TC
3,Balashikha,TRC
4,Volzhsky,TC


In [25]:
# Add shop's columns to p_df by matching p_df's 'shop_id' key against shop's
# index (shop was re-indexed by shop_id above).
p_df = p_df.join(shop, on='shop_id')

In [26]:
# Preview p_df after both joins; the four joined columns are still text here.
p_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,item_cat1,item_cat2,City,Type
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Cinema,DVD,Yakutsk,Shop
0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Cinema,Blu-Ray,Yakutsk,Shop
0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Cinema,DVD,Yakutsk,Shop
0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Cinema,Blu-Ray,Yakutsk,Shop
0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Cinema,DVD,Yakutsk,Shop


In [27]:
# Replace the item_cat1 labels with integer codes (refits the shared encoder `le`).
p_df['item_cat1'] = le.fit_transform(p_df['item_cat1'])

In [28]:
# Replace the item_cat2 labels with integer codes. The column is cast to str
# first - presumably because it holds missing or mixed-type values; TODO confirm.
p_df['item_cat2'] = le.fit_transform(p_df['item_cat2'].astype(str))

In [29]:
# Replace the City labels with integer codes (refits the shared encoder `le`).
p_df['City'] = le.fit_transform(p_df['City'])

In [30]:
# Replace the Type labels with integer codes; after this line `le` holds only
# the Type mapping.
p_df['Type'] = le.fit_transform(p_df['Type'])

In [31]:
# Preview p_df again to confirm the four joined columns are now integer codes.
p_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,item_cat1,item_cat2,City,Type
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,23,26,3
0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,9,26,3
0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,23,26,3
0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,9,26,3
0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,23,26,3


In [32]:
m = 33  # label of the held-out target column; for CV, m<=33
p = 32  # number of monthly columns used as features; p<=m-1

# Features shared by both matrices, placed after the monthly columns.
static_cols = ['item_cat1', 'item_cat2', 'City', 'Type']

# Training window: columns m-p-1 .. m-2, with column m-1 as the target.
col_x_train = [*np.arange(m - p - 1, m - 1), *static_cols]
# Evaluation window: the same width shifted by one column (m-p .. m-1), with
# column m as the target.
col_x_test = [*np.arange(m - p, m), *static_cols]

x_train = p_df[col_x_train].values
x_test = p_df[col_x_test].values
y_train = p_df[m - 1].values
y_test = p_df[m].values

In [33]:
# rename = dict(zip(x_test.columns[:p], list(np.array(x_test.columns[:p])-1)))

# x_test = x_test.rename(rename, axis=1)

In [34]:
# Booster settings for the validation run. 'maximize' is no longer listed here:
# it is an argument of xgb.train (only consulted together with early stopping),
# not a booster parameter, so it had no effect inside this dict.
param = {'max_depth': 10,
         'subsample': 1,
         'min_child_weight': 0.5,
         'eta': 0.3,
         'seed': 1,
         'silent': 0,
         'eval_metric': 'rmse'}

# Filled by xgb.train with the per-round metric of every watchlist entry.
progress = dict()
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
watchlist = [(dtrain, 'train'), (dtest, 'test')]

# No num_boost_round is passed, so the library default applies
# (the recorded log shows rounds 0-9).
# NOTE(review): in that log test-rmse climbs from 4.01 to 6.23 while train-rmse
# keeps falling - consider early_stopping_rounds or stronger regularisation.
bst = xgb.train(param, dtrain, evals=watchlist,
                evals_result=progress,
                verbose_eval=True)

# Score the held-out column. dtest is reused instead of building a second
# DMatrix, and the arguments follow scikit-learn's (y_true, y_pred) order;
# squared error is symmetric, so the value itself is unchanged.
preds = bst.predict(dtest)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(rmse)

[0]	train-rmse:4.10918	test-rmse:4.00974
[1]	train-rmse:3.46882	test-rmse:4.15945
[2]	train-rmse:2.94932	test-rmse:4.4115
[3]	train-rmse:2.52824	test-rmse:4.68908
[4]	train-rmse:2.18851	test-rmse:4.95246
[5]	train-rmse:1.8722	test-rmse:5.28213
[6]	train-rmse:1.62718	test-rmse:5.57608
[7]	train-rmse:1.43969	test-rmse:5.83164
[8]	train-rmse:1.29715	test-rmse:6.04641
[9]	train-rmse:1.18977	test-rmse:6.22651
6.226507770253472


In [35]:
# Final fit: shift the window one column later than in the validation split.
m = 34
p = 33

# Training features: monthly columns m-p-1 .. m-2 followed by the four encoded
# categorical columns.
col_x_train = [
    *np.arange(m - p - 1, m - 1),
    'item_cat1', 'item_cat2', 'City', 'Type',
]

In [36]:
# Prediction features: monthly columns m-p .. m-1 followed by the four encoded
# categorical columns.
col_x_test = [
    *np.arange(m - p, m),
    'item_cat1', 'item_cat2', 'City', 'Type',
]

In [37]:
# Materialise the final matrices as NumPy arrays.
x_train = p_df[col_x_train].values
x_test = p_df[col_x_test].values
# The training target is the column labelled m-1.
y_train = p_df[m - 1].values
# No y_test here: the line below stays disabled because the previews of p_df
# show monthly columns only up to 33, so there is no column m to score against.
#y_test = p_df.loc[:, m].values

In [40]:
# test.set_index('ID', inplace=True)
# test = test.join(item, '')

In [41]:
# Booster settings for the final fit. The former 'num_round': 1000 entry was
# removed: the Python API takes the round count from xgb.train's
# num_boost_round argument, so the dict entry was ignored (the recorded log
# stops at round 9).
param = {'max_depth': 10,
         'subsample': 1,
         'min_child_weight': 0.5,
         'eta': 0.3,
         'seed': 1,
         'silent': 0,
         'eval_metric': 'rmse'}

# Number of boosting rounds, now explicit. It is kept at the 10 rounds the
# notebook actually ran. NOTE(review): the validation logs earlier in the
# notebook show test-rmse rising with every round, so raise this only together
# with early stopping on a real hold-out set.
num_boost_round = 10

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
# Both watchlist entries are the training matrix (no labels exist for dtest),
# so the 'test-rmse' column of the log repeats the training error and is not a
# held-out score.
watchlist = [(dtrain, 'train'), (dtrain, 'test')]

bst = xgb.train(param, dtrain, num_boost_round=num_boost_round,
                verbose_eval=True, evals=watchlist)
preds = bst.predict(dtest)


[0]	train-rmse:3.407	test-rmse:3.407
[1]	train-rmse:2.91052	test-rmse:2.91052
[2]	train-rmse:2.50897	test-rmse:2.50897
[3]	train-rmse:2.18125	test-rmse:2.18125
[4]	train-rmse:1.91858	test-rmse:1.91858
[5]	train-rmse:1.70451	test-rmse:1.70451
[6]	train-rmse:1.53401	test-rmse:1.53401
[7]	train-rmse:1.40018	test-rmse:1.40018
[8]	train-rmse:1.29658	test-rmse:1.29658
[9]	train-rmse:1.21578	test-rmse:1.21578
