In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

if os.name == 'nt':
    # NOTE(review): presumably the Windows build of xgboost needs the MinGW
    # runtime on PATH before it is imported -- confirm this is still required.
    # The install location below is machine-specific.
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # .get() covers the one failure the old bare `except: pass` could hide
    # (PATH missing from the environment) without swallowing anything else.
    os.environ['PATH'] = os.pathsep.join(
        part for part in (mingw_path, os.environ.get('PATH', '')) if part)
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Functions

In [2]:
def lag_features(df, lags, col):
    '''Append lagged copies of `col` to `df`.

    For every `k` in `lags` a column named `<col>_lag_<k>` is added. It holds
    the value `col` had `k` steps of `date_block_num` earlier for the same
    `(shop_id, item_id)` pair; rows without such an earlier record get NaN.
    The merged frame is returned; the frame passed in is left untouched.
    '''
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    base = df.loc[:, key_cols + [col]]
    for lag in lags:
        lagged = base.copy()
        lagged.columns = key_cols + [col + '_lag_' + str(lag)]
        # pushing the block number forward by `lag` lines the old value up
        # with the later row it should be attached to
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = pd.merge(df, lagged, on=key_cols, how='left')
    return df

In [3]:
def clip_rmse(preds, dtrain):
    '''Evaluation metric: RMSE computed after clipping to [0, 20].

    Both `preds` and the labels read from `dtrain.get_label()` are clipped to
    the range [0, 20] before the error is measured.
    Returns the pair `('clip-rmse', value)`.
    '''
    lower, upper = 0, 20
    labels = np.clip(np.array(dtrain.get_label()), lower, upper)
    clipped_preds = np.clip(np.array(preds), lower, upper)
    return 'clip-rmse', np.sqrt(mean_squared_error(clipped_preds, labels))

In [4]:
def cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, random_state):
    '''Repeated stratified K-fold cross-validation of an xgboost model.

    Parameters
    ----------
    x_train, y_train : features and target; fold rows are selected with `.iloc`.
    param : dict of xgboost parameters. It is copied internally, so the
        per-fold 'seed' drawn here does not overwrite the caller's dict.
    n_repetition : number of times the K-fold split is repeated.
    n_split : number of folds per repetition.
    n_tree : number of boosting rounds trained per fold.
    verbose : forwarded to `xgb.train(verbose_eval=...)`.
    random_state : seeds numpy's global RNG, which then draws the seed of each
        split and of each fold's model.

    Returns
    -------
    df : DataFrame indexed by boosting iteration with column levels
        (dataset, repetition, cv_split) holding the 'clip-rmse' history.
    clf : dict mapping (repetition, split) to the trained booster.
    running_time : dict mapping (repetition, split) to elapsed seconds.

    A summary line is printed per fold, and the mean/std of the final
    validation score is printed at the end.
    '''

    cv_results = {}
    clf = {}
    running_time = {}

    # Private copy: the seed written below must not leak into the caller's dict
    fold_param = dict(param)

    np.random.seed(random_state)

    for m in range(n_repetition):
        # Train and validation sets split
        skf = StratifiedKFold(n_splits=n_split, random_state=np.random.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(x_train.iloc[train_index], label=y_train.iloc[train_index])
            dval = xgb.DMatrix(x_train.iloc[val_index], label=y_train.iloc[val_index])

            # Filled in by xgb.train with the per-round metric history
            cv_results[m, n] = {}

            fold_param['seed'] = np.random.randint(10**6)
            clf[m, n] = xgb.train(fold_param, dtrain, num_boost_round=n_tree,
                                  evals=[(dtrain, 'train'), (dval, 'val')],
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None,
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n,
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    # Flatten the nested results into {(dataset, repetition, split): history}
    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    # Last row = score after the final boosting round of every fold
    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))

    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Aggregate `x` over the groups defined by `feature_list`.

    `function_dict` is handed to `groupby(...).agg`, and the resulting columns
    are renamed, in order, to `column_name`.

    With `merge=False` the aggregated frame (indexed by `feature_list`) is
    returned. With `merge=True` the result is the pair
    `(x_merged, aggregated)`, where `x_merged` is `x` left-joined with the
    aggregates on `feature_list`.
    '''
    aggregated = x.groupby(feature_list).agg(function_dict)
    aggregated.columns = column_name
    if not merge:
        return aggregated
    return x.merge(aggregated, on=feature_list, how='left'), aggregated

### Load Data

In [6]:
# Read the raw tables, dropping the columns that are not used downstream
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz').set_index('ID')
shop = pd.read_csv('all/shops-translated.csv').drop(columns=['Name'])
item = pd.read_csv('all/item_category.csv').drop(columns=['item_name_translated'])

# Integer-encode the categorical columns (item categories are cast to str first)
le = LabelEncoder()
for cat_col in ('item_cat1', 'item_cat2'):
    item[cat_col] = le.fit_transform(item[cat_col].astype(str))
for cat_col in ('City', 'Type'):
    shop[cat_col] = le.fit_transform(shop[cat_col])

In [7]:
# Tag every test row with date_block_num 34 and keep only the three key columns
test = test.assign(date_block_num=34)[['date_block_num', 'shop_id', 'item_id']]

In [8]:
# Attach the shop and item attributes, then derive the revenue of each sales row
train = (
    train
    .merge(shop, on='shop_id', how='left')
    .merge(item, on='item_id', how='left')
)
train['revenue'] = train['item_price'] * train['item_cnt_day']

### Experiments

In [9]:
# Aggregate the sales rows per (date_block_num, shop_id, item_id).
# String aliases replace np.mean / np.std / np.sum: pandas currently maps those
# callables to the same built-in reductions but has deprecated passing them,
# so the aliases pin today's behaviour.
x = train.groupby(['date_block_num', 'shop_id', 'item_id']).agg(
    {'item_price': ['mean', 'std'], 'item_cnt_day': 'sum', 'revenue': 'sum'})

# Flat names for the aggregated columns, in the order of the agg dict above.
# `cols` is extended by later cells with every newly created feature name.
cols = ['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']
x.columns = cols

x.reset_index(inplace=True)
# Append the test rows; ignore_index already yields a fresh 0..n-1 index
x = pd.concat((x, test), sort=False, ignore_index=True)

x = pd.merge(x, shop, on='shop_id', how='left')
x = pd.merge(x, item, on='item_id', how='left')

In [10]:
# Reorder: first three columns, then the four categorical columns, then the
# next four columns. Anything after that is dropped.
current = x.columns.tolist()
categorical = ['City', 'Type', 'item_cat1', 'item_cat2']
tmp = current[:3] + categorical + current[3:7]
x = x[tmp]

In [11]:
# Feature scores from a previous run (the final cell of this notebook writes
# this same file). Only the first two CSV columns are read.
a = pd.read_csv('eda_11_5_feature_importance.csv', usecols=[0, 1])
a.columns = ['feature_name', 'score']
a = a.sort_values(by='score', ascending=False)

#### Lag of `['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']`

In [12]:
# Lags 1..6 of every base aggregate currently listed in `cols`.
# NOTE(review): tqdm_notebook is deprecated in recent tqdm releases in favour
# of tqdm.notebook.tqdm -- confirm against the installed version.
base_lags = range(1, 7)
for base_col in tqdm.tqdm_notebook(cols):
    x = lag_features(x, base_lags, base_col)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




#### Create combinations of features

##### `(item_id, Type)`

- count of shops with the same type and selling the same item, i.e., having the same `(item_id, Type)`
- average sales count among these shops

In [13]:
# Position, name and score of the first feature in `a` that mentions 'item_type'
first_hit = next((pos for pos in range(len(a)) if 'item_type' in a.iloc[pos, 0]), None)
if first_hit is not None:
    print(first_hit, a.iloc[first_hit, 0], a.iloc[first_hit, 1])
# (item, type) is not good

43 rev_mean_item_type_lag_1 205.1912295064547


In [14]:
# Mean of rev_sum_month per (date_block_num, item_id, Type).
# The 'mean' alias replaces np.mean: same reduction, consistent with the cells
# that already use the alias, and unaffected by pandas' deprecation of numpy
# callables in agg.
feature_list = ['date_block_num', 'item_id', 'Type']
function_list = {'rev_sum_month': 'mean'}
column_name = ['rev_mean_item_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [15]:
# Lag-1 version of every feature named in `column_name`
lags = [1]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(item_id, City)`

In [16]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'item_city'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'item_city' in name:
        print(rank, name, score)
# (item, city) keeps cnt_mean, rev_mean, price_mean, shop_count
# lag = 1, 2, 5

15 price_mean_item_city_lag_1 407.2581965373557
17 cnt_mean_item_city_lag_1 402.0346140541581
32 rev_mean_item_city_lag_1 247.7419952194
42 cnt_mean_item_city_lag_2 207.6280725768317


In [17]:
# Means of the monthly count, price and revenue per (date_block_num, item_id, City).
# The 'mean' alias is used for all three columns instead of mixing it with
# np.mean: same reduction, and unaffected by pandas' deprecation of numpy
# callables in agg.
feature_list = ['date_block_num', 'item_id', 'City']
function_list = {'cnt_sum_month': 'mean', 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['cnt_mean_item_city', 'price_mean_item_city', 'rev_mean_item_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [18]:
# Lag-1 and lag-2 versions of every feature named in `column_name`
lags = [1, 2]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(cat1, shop)`

In [19]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat1_shop'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat1_shop' in name:
        print(rank, name, score)
# (cat1, shop) keeps cnt_mean
# lag = 1

28 cnt_mean_cat1_shop_lag_1 264.6864350965387


In [20]:
# Mean of cnt_sum_month per (date_block_num, item_cat1, shop_id)
column_name = ['cnt_mean_cat1_shop']
feature_list = ['date_block_num', 'item_cat1', 'shop_id']
function_list = dict(cnt_sum_month='mean')
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list=feature_list, function_dict=function_list,
                             column_name=column_name, merge=True)

In [21]:
# Lag-1 version of every feature named in `column_name`
lags = [1]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(cat1, City)`

In [22]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat1_city'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat1_city' in name:
        print(rank, name, score)
# (cat1, city) keeps rev_mean
# lag = 2

39 rev_mean_cat1_city_lag_1 224.80017184708407


In [23]:
# Mean of rev_sum_month per (date_block_num, item_cat1, City).
# The 'mean' alias replaces np.mean: same reduction, consistent with the cells
# that already use the alias, and unaffected by pandas' deprecation of numpy
# callables in agg.
feature_list = ['date_block_num', 'item_cat1', 'City']
function_list = {'rev_sum_month': 'mean'}
column_name = ['rev_mean_cat1_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [24]:
# Lag-1 and lag-2 versions of every feature named in `column_name`
lags = [1, 2]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(cat1, Type)`

In [25]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat1_type'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat1_type' in name:
        print(rank, name, score)
# (cat1, type) keeps cnt_mean, rev_mean, price_mean, shop_count
# lag = 1, 2, 3

23 price_mean_cat1_type_lag_2 318.85016734611116
24 rev_mean_cat1_type_lag_1 305.90656485440684
31 rev_mean_cat1_type_lag_3 249.9310523411764
36 price_mean_cat1_type_lag_1 232.9496278383077
37 cnt_mean_cat1_type_lag_1 225.8227948488464
47 cnt_mean_cat1_type_lag_2 200.1026545284617


In [26]:
# Means of the monthly count, price and revenue per (date_block_num, item_cat1, Type).
# The 'mean' alias is used for all three columns instead of mixing it with
# np.mean: same reduction, and unaffected by pandas' deprecation of numpy
# callables in agg.
feature_list = ['date_block_num', 'item_cat1', 'Type']
function_list = {'cnt_sum_month': 'mean', 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['cnt_mean_cat1_type', 'price_mean_cat1_type', 'rev_mean_cat1_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [27]:
# Lag-1, lag-2 and lag-3 versions of every feature named in `column_name`
lags = [1, 2, 3]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(cat2, shop)`

In [29]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat2_shop'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat2_shop' in name:
        print(rank, name, score)
# (cat2, shop) is not good

In [30]:
#feature_list = ['date_block_num', 'item_cat2', 'shop_id']
#function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
#column_name = ['shop_count_cat2_shop', 'cnt_mean_cat2_shop', 'price_mean_cat2_shop', 'rev_mean_cat2_shop']
#cols.extend(column_name)
#x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, city)`

In [31]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat2_city'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat2_city' in name:
        print(rank, name, score)
# (cat2, city) keeps shop_count
# lag = 8

In [32]:
#feature_list = ['date_block_num', 'item_cat2', 'City']
#function_list = {'cnt_sum_month':'count'}
#column_name = ['shop_count_cat2_city']
#cols.extend(column_name)
#x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [33]:
#for c in column_name:
#    x = lag_features(x, [1, 2, 3, 4, 5, 6, 7, 8], c)

##### `(cat2, type)`

In [34]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat2_type'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat2_type' in name:
        print(rank, name, score)

21 cnt_mean_cat2_type_lag_2 336.4159801201851
33 rev_mean_cat2_type_lag_1 237.25313811365382
40 cnt_mean_cat2_type_lag_1 223.8460154976001
51 price_mean_cat2_type_lag_3 186.54455810311364


In [35]:
# Means of the monthly count, price and revenue per (date_block_num, item_cat2, Type).
# The 'mean' alias is used for all three columns instead of mixing it with
# np.mean: same reduction, and unaffected by pandas' deprecation of numpy
# callables in agg.
feature_list = ['date_block_num', 'item_cat2', 'Type']
function_list = {'cnt_sum_month': 'mean', 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['cnt_mean_cat2_type', 'price_mean_cat2_type', 'rev_mean_cat2_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [36]:
# Lag-1 and lag-2 versions of every feature named in `column_name`
lags = [1, 2]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(cat1, cat2)`

In [37]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'cat1_cat2'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'cat1_cat2' in name:
        print(rank, name, score)

0 cnt_mean_cat1_cat2_lag_2 10181.935589181076
2 cnt_mean_cat1_cat2_lag_1 1255.6685222378576
7 shop_count_cat1_cat2_lag_1 690.0411360021566
12 price_mean_cat1_cat2_lag_2 437.05246716000016
13 shop_count_cat1_cat2_lag_3 432.8327789164
18 cnt_mean_cat1_cat2_lag_3 393.79537253769234
19 price_mean_cat1_cat2_lag_5 368.8642840515625
20 price_mean_cat1_cat2_lag_3 362.13488607333335
25 shop_count_cat1_cat2_lag_2 288.4192905980952
29 price_mean_cat1_cat2_lag_1 261.4393863478182
30 rev_mean_cat1_cat2_lag_1 257.97522428891307
34 rev_mean_cat1_cat2_lag_2 236.4968992340541
38 rev_mean_cat1_cat2_lag_5 224.871415595862
44 shop_count_cat1_cat2_lag_4 205.12218922341373
45 shop_count_cat1_cat2_lag_6 203.25753134102558
46 rev_mean_cat1_cat2_lag_3 200.7767644164
48 rev_mean_cat1_cat2_lag_4 192.63357779615384


In [38]:
# Per (date_block_num, item_cat1, item_cat2): the count and mean of
# cnt_sum_month, plus the means of the monthly price and revenue.
# The 'mean' alias replaces np.mean: same reduction, consistent with the
# 'count'/'mean' aliases already used here, and unaffected by pandas'
# deprecation of numpy callables in agg.
feature_list = ['date_block_num', 'item_cat1', 'item_cat2']
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat1_cat2', 'cnt_mean_cat1_cat2', 'price_mean_cat1_cat2', 'rev_mean_cat1_cat2']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [39]:
# Lags 1 through 6 of every feature named in `column_name`
lags = [1, 2, 3, 4, 5, 6]
for feature in column_name:
    x = lag_features(x, lags, feature)

##### `(city, type)`

In [40]:
# Among the first 52 rows of `a`, print position, name and score of every
# feature whose name mentions 'city_type'
for rank in range(min(len(a), 52)):
    name, score = a.iloc[rank, 0], a.iloc[rank, 1]
    if 'city_type' in name:
        print(rank, name, score)
# (city, type) is not good

In [41]:
#feature_list = ['date_block_num', 'City', 'Type']
#function_list = {'price_mean_month':np.mean, 'rev_sum_month':np.mean}
#column_name = ['price_mean_city_type', 'rev_mean_city_type']
#cols.extend(column_name)
#x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [42]:
#for c in column_name:
#    x = lag_features(x, [1, 5, 12], c)

Remove the current-month values of all aggregated features (everything in `cols` except the target `cnt_sum_month`), since this information is not available for the test data.

In [45]:
# Drop every column listed in `cols` except the target 'cnt_sum_month'.
# A separate list is built instead of calling cols.remove(), and the drop is
# not done in place, so re-running this cell is a no-op rather than an error.
current_month_cols = [name for name in cols if name != 'cnt_sum_month']
x = x.drop(columns=current_month_cols, errors='ignore')

#### CV

In [47]:
# Date blocks 6..33 form the training set; block 34 is the test set
is_train = x['date_block_num'].between(6, 33)
is_test = x['date_block_num'] == 34
x_train = x.loc[is_train, :].copy()

# The test set carries no target column
x_test = x.loc[is_test, :].drop(columns=['cnt_sum_month'])

# Separate the target from the training features.
# The target is clipped to [0, 40] here, while the evaluation metric clips to
# [0, 20]; the author describes this as similar to the idea of calibration.
y_train = x_train.pop('cnt_sum_month').clip(0, 40)

In [None]:
# Cross-validation settings
n_repetition = 1
n_split = 3
n_tree = 80
verbose = True

# Booster parameters handed to xgb.train through cv().
# NOTE(review): 'maximize' is also passed explicitly to xgb.train inside cv(),
# and newer xgboost releases replaced 'silent' with 'verbosity' -- confirm
# both against the installed version.
param = {
    'max_depth': 8,
    'subsample': 0.8,
    'min_child_weight': 5,
    'eta': 0.3,
    'lambda': 2,
    'colsample_bytree': 0.8,
    'seed': 1,
    'silent': 1,
    'maximize': False,
    'nthread': 8,
}

df, clf, running_time = cv(x_train, y_train, param,
                           n_repetition=n_repetition, n_split=n_split,
                           n_tree=n_tree, verbose=verbose, random_state=42)

[0]	train-rmse:3.23992	val-rmse:3.24644	train-clip-rmse:2.51909	val-clip-rmse:2.52481
[1]	train-rmse:2.93021	val-rmse:2.94515	train-clip-rmse:2.24786	val-clip-rmse:2.26091
[2]	train-rmse:2.75342	val-rmse:2.7743	train-clip-rmse:2.11382	val-clip-rmse:2.13217
[3]	train-rmse:2.65281	val-rmse:2.68006	train-clip-rmse:2.04484	val-clip-rmse:2.06842
[4]	train-rmse:2.59031	val-rmse:2.62431	train-clip-rmse:2.00617	val-clip-rmse:2.03512
[5]	train-rmse:2.53172	val-rmse:2.57003	train-clip-rmse:1.97232	val-clip-rmse:2.00435
[6]	train-rmse:2.50331	val-rmse:2.54829	train-clip-rmse:1.95721	val-clip-rmse:1.99397
[7]	train-rmse:2.47266	val-rmse:2.52135	train-clip-rmse:1.93847	val-clip-rmse:1.97722
[8]	train-rmse:2.44599	val-rmse:2.50118	train-clip-rmse:1.92166	val-clip-rmse:1.96565
[9]	train-rmse:2.43422	val-rmse:2.49283	train-clip-rmse:1.91553	val-clip-rmse:1.96167
[10]	train-rmse:2.40943	val-rmse:2.47464	train-clip-rmse:1.89917	val-clip-rmse:1.94852
[11]	train-rmse:2.39776	val-rmse:2.46657	train-clip-rm

[15]	train-rmse:2.35651	val-rmse:2.44703	train-clip-rmse:1.86323	val-clip-rmse:1.91339
[16]	train-rmse:2.32789	val-rmse:2.42216	train-clip-rmse:1.84247	val-clip-rmse:1.89543
[17]	train-rmse:2.31892	val-rmse:2.41596	train-clip-rmse:1.83639	val-clip-rmse:1.89162
[18]	train-rmse:2.29552	val-rmse:2.39595	train-clip-rmse:1.82074	val-clip-rmse:1.87824
[19]	train-rmse:2.28877	val-rmse:2.39191	train-clip-rmse:1.81618	val-clip-rmse:1.87574
[20]	train-rmse:2.28133	val-rmse:2.38776	train-clip-rmse:1.80996	val-clip-rmse:1.87167
[21]	train-rmse:2.27341	val-rmse:2.38195	train-clip-rmse:1.8045	val-clip-rmse:1.86777
[22]	train-rmse:2.25816	val-rmse:2.36953	train-clip-rmse:1.79419	val-clip-rmse:1.86
[23]	train-rmse:2.23839	val-rmse:2.35522	train-clip-rmse:1.78073	val-clip-rmse:1.84995
[24]	train-rmse:2.23665	val-rmse:2.35403	train-clip-rmse:1.77941	val-clip-rmse:1.8493
[25]	train-rmse:2.23334	val-rmse:2.35252	train-clip-rmse:1.77705	val-clip-rmse:1.84803
[26]	train-rmse:2.22943	val-rmse:2.34953	train-c

In [55]:
# Gain-based importance of every trained fold model, keyed like `clf`.
# Each entry has to come from its own booster clf[k]; indexing clf[0, 0] for
# every key made all entries copies of the first model's scores.
# Note: this rebinds `a`, which earlier held the importance table read from CSV.
a = {k: clf[k].get_score(importance_type='gain') for k in clf.keys()}

In [56]:
# One column per key of `a`, one row per feature name
b = pd.DataFrame.from_dict(a, orient='columns')

In [57]:
# Replace the (repetition, split) column labels with 0..k-1. The actual column
# count is used instead of n_split so this also works when n_repetition > 1.
b.columns = list(range(b.shape[1]))

In [58]:
# Average score of each feature across the columns of `b`, highest first
mean_gain = b.mean(axis=1)
c = mean_gain.sort_values(ascending=False)

In [59]:
# Among the 50 top-ranked features, collect the distinct base names and lag
# orders of the lag features (names shaped '<base>_lag_<k>')
d = c.head(50).index.tolist()
e, f = set(), set()
for feature in d:
    parts = feature.split('_')
    if len(parts) >= 2 and parts[-2] == 'lag':
        e.add('_'.join(parts[:-2]))
        f.add(int(parts[-1]))

In [60]:
e

{'cnt_mean_cat1_cat2',
 'cnt_mean_cat1_shop',
 'cnt_mean_cat1_type',
 'cnt_mean_cat2_type',
 'cnt_mean_item_city',
 'cnt_sum_month',
 'price_mean_cat1_cat2',
 'price_mean_cat1_type',
 'price_mean_item_city',
 'price_mean_month',
 'rev_mean_cat1_cat2',
 'rev_mean_cat1_city',
 'rev_mean_cat1_type',
 'rev_mean_cat2_type',
 'rev_mean_item_city',
 'rev_mean_item_type',
 'rev_sum_month',
 'shop_count_cat1_cat2'}

In [61]:
f

{1, 2, 3, 4, 5, 6}

In [62]:
26*10

260

In [64]:
# Persist the importance table `b`. An earlier cell reads this same file back
# with pd.read_csv, so a fresh run of the notebook needs the file from a
# previous run to already exist.
b.to_csv('eda_11_5_feature_importance.csv')