In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Function

In [2]:
def lag_features(df, lags, col):
    '''create lag features of col'''
    cols_idx = ['date_block_num', 'shop_id', 'item_id']
    cols_f = ['date_block_num', 'shop_id', 'item_id', col]
    tmp = df.loc[:, cols_f]
    for i in lags:
        shifted = tmp.copy()
        shifted_cols = shifted.columns.tolist()
        shifted_cols[-1] = col+'_lag_'+str(i)
        shifted.columns = shifted_cols
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=cols_idx, how='left')
    return df    

In [3]:
def clip_rmse(preds, dtrain):
    y_test = np.array(dtrain.get_label())
    preds = np.array(preds)
    y_test = np.maximum(np.minimum(y_test, 20), 0)
    preds = np.maximum(np.minimum(preds, 20), 0)
    #preds = np.array(list(map(lambda x: min(20, max(x, 0)), list(preds))))
    #y_test = np.array(list(map(lambda x: min(20, max(x, 0)), list(y_test))))
    rmse = np.sqrt(mean_squared_error(preds,y_test))
    return 'clip-rmse', rmse

In [4]:
def cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, random_state):
    '''Repeated CV'''
    
    cv_results = {}
    clf = {}
    running_time = {}

    np.random.seed(random_state)

    for m in range(n_repetition):
        # Train and valuation sets split
        skf = StratifiedKFold(n_splits=n_split, random_state=np.random.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(x_train.iloc[train_index], label=y_train.iloc[train_index])
            dval = xgb.DMatrix(x_train.iloc[val_index], label=y_train.iloc[val_index])

            # Placeholder for evals_results
            cv_results[m, n] = {}

            param['seed'] = np.random.randint(10**6)
            clf[m, n] = xgb.train(param, dtrain,num_boost_round=n_tree, 
                                  evals=[(dtrain, 'train'), (dval, 'val')], 
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None, 
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n, 
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))
    
    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Combination of new features'''
    tmp = x.groupby(feature_list).agg(function_dict)
    tmp.columns = column_name
    if merge:
        x = x.merge(tmp, on=feature_list, how='left')
        return x, tmp
    else:
        return tmp

### Load Data

In [6]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

test.set_index('ID', inplace=True)
item.drop(['item_name_translated'], axis=1, inplace=True)
shop.drop(['Name'], axis=1, inplace=True)

le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
shop['City'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

In [7]:
# Add date_block_num to test 
test['date_block_num'] = 34
test = test[['date_block_num', 'shop_id', 'item_id']]

In [8]:
train = train.merge(shop, on='shop_id', how='left')
train = train.merge(item, on='item_id', how='left')
train['revenue'] = train.item_price * train.item_cnt_day

### Experiments

In [9]:
x = train.groupby(['date_block_num', 
                   'shop_id', 
                   'item_id']).agg({
    'item_price':[np.mean, np.std], 'item_cnt_day': np.sum, 'revenue': np.sum})
# x.columns = ['price_mean_month', 'cnt_sum_month']

#cols = ['price_mean_month', 'price_median_month', 'price_std_month', 'cnt_sum_month']
cols = ['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']
x.columns = cols

x.reset_index(inplace=True)
x = pd.concat((x, test), sort=False, ignore_index=True)
x.reset_index(drop=True, inplace=True)

x = pd.merge(x, shop, on='shop_id', how='left')
x = pd.merge(x, item, on='item_id', how='left')

In [10]:
tmp = x.columns.tolist()
tmp[3:3] = ['City', 'Type', 'item_cat1', 'item_cat2']
tmp = tmp[:11]
x = x[tmp]

#### Create combinations of features

##### `(item_id, Type)`

- count of shops with the same type and selling the same item, i.e., having the same `(item_id, Type)`
- average sales count among these shops

In [11]:
feature_list = ['date_block_num', 'item_id', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_item_type', 'cnt_mean_item_type', 'price_mean_item_type', 'rev_mean_item_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(item_id, City)`

In [12]:
feature_list = ['date_block_num', 'item_id', 'City']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_item_city', 'cnt_mean_item_city', 'price_mean_item_city', 'rev_mean_item_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, shop)`

In [13]:
feature_list = ['date_block_num', 'item_cat1', 'shop_id']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_shop', 'cnt_mean_cat1_shop', 'price_mean_cat1_shop', 'rev_mean_cat1_shop']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, City)`

In [14]:
feature_list = ['date_block_num', 'item_cat1', 'City']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_city', 'cnt_mean_cat1_city', 'price_mean_cat1_city', 'rev_mean_cat1_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, Type)`

In [15]:
feature_list = ['date_block_num', 'item_cat1', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_type', 'cnt_mean_cat1_type', 'price_mean_cat1_type', 'rev_mean_cat1_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, shop)`

In [16]:
feature_list = ['date_block_num', 'item_cat2', 'shop_id']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat2_shop', 'cnt_mean_cat2_shop', 'price_mean_cat2_shop', 'rev_mean_cat2_shop']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, city)`

In [17]:
feature_list = ['date_block_num', 'item_cat2', 'City']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat2_city', 'cnt_mean_cat2_city', 'price_mean_cat2_city', 'rev_mean_cat2_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, type)`

In [18]:
feature_list = ['date_block_num', 'item_cat2', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat2_type', 'cnt_mean_cat2_type', 'price_mean_cat2_type', 'rev_mean_cat2_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, cat2)`

In [19]:
feature_list = ['date_block_num', 'item_cat1', 'item_cat2']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_cat2', 'cnt_mean_cat1_cat2', 'price_mean_cat1_cat2', 'rev_mean_cat1_cat2']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(city, type)`

In [20]:
feature_list = ['date_block_num', 'City', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_city_type', 'cnt_mean_city_type', 'price_mean_city_type', 'rev_mean_city_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

#### Write a function for the analysis

In [21]:
x.tail()

Unnamed: 0,date_block_num,shop_id,item_id,City,Type,item_cat1,item_cat2,price_mean_month,price_std_month,cnt_sum_month,...,price_mean_cat2_type,rev_mean_cat2_type,shop_count_cat1_cat2,cnt_mean_cat1_cat2,price_mean_cat1_cat2,rev_mean_cat1_cat2,shop_count_city_type,cnt_mean_city_type,price_mean_city_type,rev_mean_city_type
1823319,34,45,18454,15,4,12,16,,,,...,,,0,,,,0,,,
1823320,34,45,16188,15,4,10,12,,,,...,,,0,,,,0,,,
1823321,34,45,15757,15,4,12,16,,,,...,,,0,,,,0,,,
1823322,34,45,19648,15,4,3,23,,,,...,,,0,,,,0,,,
1823323,34,45,969,15,4,3,9,,,,...,,,0,,,,0,,,


In [22]:
for c in tqdm.tqdm_notebook(cols):
    x = lag_features(x, range(1, 13), c)

HBox(children=(IntProgress(value=0, max=44), HTML(value='')))




In [23]:
cols.remove('cnt_sum_month')
x.drop(cols, axis=1, inplace=True)

#### CV

In [24]:
# Split train and test sets
x_train = x.loc[x['date_block_num']<=33, :].copy()
x_test = x.loc[x['date_block_num']==34, :].copy()

# Drop target from test set
x_test.drop(['cnt_sum_month'], axis=1, inplace=True)

# Split target from train set
# Note that target is first clipped to (0, 40), then clipped to (0, 20) in test set. 
# This is similar to the idea of calibration
y_train = x_train['cnt_sum_month'].clip(0, 40)
x_train.drop(['cnt_sum_month'], axis=1, inplace=True)

In [25]:
param = {'max_depth':14, 
         'subsample':0.9,
         'min_child_weight':3,
         'eta':0.3, 
         'lambda':2,
         'colsample_bytree':0.6,
         'seed':1,
         'silent':1,
         'maximize': False,
         'nthread':8}

n_tree = 50
verbose = True

n_split = 5
n_repetition = 1

df, clf, running_time = cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, 42)

[0]	train-rmse:3.14841	val-rmse:3.1866	train-clip-rmse:2.461	val-clip-rmse:2.49269
[1]	train-rmse:2.83125	val-rmse:2.90803	train-clip-rmse:2.17921	val-clip-rmse:2.2425
[2]	train-rmse:2.61686	val-rmse:2.72753	train-clip-rmse:2.00932	val-clip-rmse:2.09866
[3]	train-rmse:2.46314	val-rmse:2.61139	train-clip-rmse:1.89512	val-clip-rmse:2.01231
[4]	train-rmse:2.35964	val-rmse:2.5339	train-clip-rmse:1.82412	val-clip-rmse:1.95994
[5]	train-rmse:2.28574	val-rmse:2.48223	train-clip-rmse:1.7736	val-clip-rmse:1.92602
[6]	train-rmse:2.23732	val-rmse:2.44804	train-clip-rmse:1.74322	val-clip-rmse:1.90647
[7]	train-rmse:2.19409	val-rmse:2.41671	train-clip-rmse:1.7145	val-clip-rmse:1.88684
[8]	train-rmse:2.16622	val-rmse:2.40069	train-clip-rmse:1.69561	val-clip-rmse:1.878
[9]	train-rmse:2.13751	val-rmse:2.38667	train-clip-rmse:1.67521	val-clip-rmse:1.87026
[10]	train-rmse:2.08259	val-rmse:2.34721	train-clip-rmse:1.63746	val-clip-rmse:1.8451
[11]	train-rmse:2.06102	val-rmse:2.33409	train-clip-rmse:1.6231

[45]	train-rmse:1.61427	val-rmse:2.08687	train-clip-rmse:1.30395	val-clip-rmse:1.69478
[46]	train-rmse:1.60872	val-rmse:2.08412	train-clip-rmse:1.29902	val-clip-rmse:1.69243
[47]	train-rmse:1.60526	val-rmse:2.08362	train-clip-rmse:1.29576	val-clip-rmse:1.69217
[48]	train-rmse:1.60007	val-rmse:2.08083	train-clip-rmse:1.29155	val-clip-rmse:1.69009
[49]	train-rmse:1.58921	val-rmse:2.07335	train-clip-rmse:1.28523	val-clip-rmse:1.6869
Repeat 0, split 1, val score = 1.687, running time = 7.051 min.
[0]	train-rmse:3.14513	val-rmse:3.20367	train-clip-rmse:2.45711	val-clip-rmse:2.51016
[1]	train-rmse:2.81037	val-rmse:2.91197	train-clip-rmse:2.15723	val-clip-rmse:2.24659
[2]	train-rmse:2.61453	val-rmse:2.75242	train-clip-rmse:2.00148	val-clip-rmse:2.11813
[3]	train-rmse:2.46957	val-rmse:2.64097	train-clip-rmse:1.89364	val-clip-rmse:2.03534
[4]	train-rmse:2.35481	val-rmse:2.55712	train-clip-rmse:1.81617	val-clip-rmse:1.9792
[5]	train-rmse:2.29469	val-rmse:2.51811	train-clip-rmse:1.77659	val-clip-

[39]	train-rmse:1.67878	val-rmse:2.12507	train-clip-rmse:1.35153	val-clip-rmse:1.71946
[40]	train-rmse:1.67319	val-rmse:2.12373	train-clip-rmse:1.34621	val-clip-rmse:1.71877
[41]	train-rmse:1.67029	val-rmse:2.1232	train-clip-rmse:1.34337	val-clip-rmse:1.71853
[42]	train-rmse:1.66545	val-rmse:2.12273	train-clip-rmse:1.3391	val-clip-rmse:1.71875
[43]	train-rmse:1.64955	val-rmse:2.11336	train-clip-rmse:1.32997	val-clip-rmse:1.71361
[44]	train-rmse:1.64785	val-rmse:2.11289	train-clip-rmse:1.32829	val-clip-rmse:1.71339
[45]	train-rmse:1.62947	val-rmse:2.1032	train-clip-rmse:1.31269	val-clip-rmse:1.70535
[46]	train-rmse:1.62492	val-rmse:2.10241	train-clip-rmse:1.30852	val-clip-rmse:1.70503
[47]	train-rmse:1.62314	val-rmse:2.10241	train-clip-rmse:1.30683	val-clip-rmse:1.70519
[48]	train-rmse:1.62102	val-rmse:2.10228	train-clip-rmse:1.30445	val-clip-rmse:1.70505
[49]	train-rmse:1.61914	val-rmse:2.10205	train-clip-rmse:1.30283	val-clip-rmse:1.70499
Repeat 0, split 3, val score = 1.705, running 

In [35]:
a = {k: clf[0, 0].get_score(importance_type='gain') for k in clf.keys()}

In [37]:
b = pd.DataFrame(a)

In [39]:
b.columns = list(range(n_split))

In [43]:
c = b.mean(axis=1).sort_values(ascending=False)

In [60]:
d = c.head(50).index.tolist()
e = []
f = []
for n in d:
    if len(n.split('_'))>=2 and n.split('_')[-2] == 'lag':
        e.append('_'.join(n.split('_')[:-2]))
        
e = set(e)

In [61]:
e

{'cnt_mean_cat1_cat2',
 'cnt_mean_cat1_shop',
 'cnt_mean_cat1_type',
 'cnt_mean_cat2_type',
 'cnt_mean_item_city',
 'cnt_mean_item_type',
 'cnt_sum_month',
 'price_mean_cat1_cat2',
 'price_mean_cat1_type',
 'price_mean_cat2_type',
 'price_mean_city_type',
 'price_mean_item_city',
 'price_mean_item_type',
 'price_mean_month',
 'price_std_month',
 'rev_mean_cat1_cat2',
 'rev_mean_cat1_city',
 'rev_mean_cat1_type',
 'rev_mean_cat2_type',
 'rev_mean_city_type',
 'rev_mean_item_city',
 'rev_mean_item_type',
 'shop_count_cat1_cat2',
 'shop_count_cat1_type',
 'shop_count_cat2_city',
 'shop_count_item_city'}