In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb

%matplotlib inline

In [2]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

In [4]:
p_df = train.pivot_table(index=['shop_id','item_id'], 
                            columns='date_block_num', 
                            values='item_cnt_day',
                            aggfunc='sum').fillna(0.0)

In [5]:
p_df.head()

Unnamed: 0_level_0,date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,30,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,31,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,32,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,33,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,35,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
p_df = p_df.reset_index()

In [8]:
item.head()

Unnamed: 0,item_id,item_name_translated,item_cat1,item_cat2
0,5441,PC: Headset HyperX Cloud Core gaming stereo (K...,PC,Headsets / Headphones
1,16255,Headphones PHILIPS SBC HC8680,PC,Headsets / Headphones
2,16256,Headphones RITMIX RH-120,PC,Headsets / Headphones
3,16257,Headphones RITMIX RH-124 Black,PC,Headsets / Headphones
4,5606,PS2: Memory Card 8 MB Black (Memory Card 8Mb -...,Accessories,PS2


In [10]:
p_df = p_df.merge(item, how='inner', on='item_id')

In [11]:
p_df.drop(['item_name_translated'], axis=1, inplace=True)

In [14]:
le = LabelEncoder()
p_df['item_cat1'] = le.fit_transform(p_df['item_cat1'])
p_df['item_cat2'] = p_df['item_cat2'].astype(str)
p_df['item_cat2'] = le.fit_transform(p_df['item_cat2'])

In [17]:
import xgboost as xgb
param = {'max_depth':10, 
         'subsample':1,
         'min_child_weight':0.5,
         'eta':0.3, 
         'num_round':1000, 
         'seed':1,
         'silent':0,
         'eval_metric':'rmse'}

progress = dict()
xgbtrain = xgb.DMatrix(p_df.iloc[:,  (p_df.columns != 33)].values, p_df.iloc[:, p_df.columns == 33].values)
watchlist  = [(xgbtrain,'train-rmse')]

bst = xgb.train(param, xgbtrain)
preds = bst.predict(xgb.DMatrix(p_df.iloc[:,  (p_df.columns != 33)].values))
from sklearn.metrics import mean_squared_error 
rmse = np.sqrt(mean_squared_error(preds,p_df.iloc[:, p_df.columns == 33].values))
print(rmse)

1.2652050841170301


In [20]:
t_df = test.merge(p_df, how='left', on=['shop_id', 'item_id']).fillna(0.0)

cols = t_df.columns

cols = cols.insert(3, cols[-2])
cols = cols.insert(4, cols[-1])

cols = cols[:-2]

In [35]:
t_df = t_df[cols]

In [37]:
t_df.head()

Unnamed: 0,ID,shop_id,item_id,item_cat1,item_cat2,0,1,2,3,4,...,24,25,26,27,28,29,30,31,32,33
0,0,5,5037,9.0,42.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,9.0,42.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,9.0,58.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
d = dict(zip(t_df.columns[5:],list(np.array(list(t_df.columns[5:])) - 1)))

In [43]:
t_df = t_df.rename(d, axis=1)

In [46]:
preds = bst.predict(xgb.DMatrix(t_df.iloc[:, (t_df.columns != 'ID') & (t_df.columns != -1)].values))

In [48]:
preds = list(map(lambda x: min(20,max(x,0)), list(preds)))
sub_df = pd.DataFrame({'ID':t_df.ID,'item_cnt_month': preds })
sub_df.describe()

Unnamed: 0,ID,item_cnt_month
count,214200.0,214200.0
mean,107099.5,1.346495
std,61834.358168,1.01497
min,0.0,0.0
25%,53549.75,0.848627
50%,107099.5,1.052436
75%,160649.25,2.146009
max,214199.0,20.0


In [49]:
sub_df.to_csv('xg_boost4_cats.csv',index=False)