In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

if os.name == 'nt':
    # Prepend the MinGW runtime directory to PATH on Windows.
    # NOTE(review): presumably needed so the xgboost import below can locate
    # its runtime DLLs -- confirm on the target machine.
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # A missing PATH variable is the only realistic failure here; handle it
    # explicitly instead of hiding every exception behind a bare `except`.
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Functions

In [2]:
def lag_features(df, lags, col):
    '''Append lagged copies of `col` to `df`.

    For every lag `k` in `lags`, the value of `col` observed for the same
    (shop_id, item_id) pair `k` blocks earlier is left-merged onto `df` as a
    new column named `<col>_lag_<k>`. Rows without a matching earlier record
    get NaN in that column.

    Parameters
    ----------
    df : DataFrame containing 'date_block_num', 'shop_id', 'item_id' and `col`.
    lags : iterable of int offsets added to 'date_block_num'.
    col : name of the column to lag.

    Returns
    -------
    A new DataFrame: `df` plus one column per lag.
    '''
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df.loc[:, keys + [col]]
    for lag in lags:
        # Moving the source rows forward by `lag` blocks lines them up with
        # the rows that should receive them as a lagged value.
        lagged = base.assign(date_block_num=base['date_block_num'] + lag)
        lagged.columns = keys + [col + '_lag_' + str(lag)]
        df = pd.merge(df, lagged, on=keys, how='left')
    return df

In [3]:
def clip_rmse(preds, dtrain):
    '''Custom evaluation metric: RMSE after clipping to [0, 20].

    Both the predictions and the labels read from `dtrain.get_label()` are
    clipped to the range [0, 20] before the root-mean-squared error is
    computed.

    Returns
    -------
    tuple ('clip-rmse', value) -- metric name and score.
    '''
    labels = np.clip(np.array(dtrain.get_label()), 0, 20)
    clipped_preds = np.clip(np.array(preds), 0, 20)
    score = np.sqrt(mean_squared_error(clipped_preds, labels))
    return 'clip-rmse', score

In [4]:
def cv(param, n_repetition, n_split, n_tree, verbose, random_state,
       features=None, target=None):
    '''Repeated stratified K-fold cross-validation of an XGBoost model.

    Parameters
    ----------
    param : dict of booster parameters handed to `xgb.train`. It is copied,
        so the caller's dict is left untouched (a fresh 'seed' is drawn for
        every fold on the copy).
    n_repetition : number of times the K-fold split is repeated.
    n_split : number of folds per repetition.
    n_tree : number of boosting rounds per fold.
    verbose : forwarded to `xgb.train` as `verbose_eval`.
    random_state : seed for NumPy's global RNG, from which the split seed and
        the per-fold booster seeds are drawn.
    features, target : optional training frame and labels. When omitted, the
        notebook-level `x_train` / `y_train` are used.

    Returns
    -------
    df : DataFrame indexed by boosting iteration, with columns keyed by
        (dataset, repetition, cv_split) holding the 'clip-rmse' history.
    clf : dict mapping (repetition, split) to the trained booster.
    running_time : dict mapping (repetition, split) to wall time in seconds.
    '''
    # Fall back to the notebook globals so existing calls keep working.
    if features is None:
        features = x_train
    if target is None:
        target = y_train

    # Copy so that overwriting 'seed' below does not leak into the caller's dict.
    params = dict(param)

    cv_results = {}
    clf = {}
    running_time = {}

    np.random.seed(random_state)

    for m in range(n_repetition):
        # Train and validation sets split.
        # NOTE(review): stratification treats every distinct target value as
        # a class -- confirm this is intended for a regression target.
        skf = StratifiedKFold(n_splits=n_split, random_state=np.random.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(features, target)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(features.iloc[train_index], label=target.iloc[train_index])
            dval = xgb.DMatrix(features.iloc[val_index], label=target.iloc[val_index])

            # Placeholder that xgb.train fills with the evaluation history
            cv_results[m, n] = {}

            params['seed'] = np.random.randint(10**6)
            clf[m, n] = xgb.train(params, dtrain, num_boost_round=n_tree,
                                  evals=[(dtrain, 'train'), (dval, 'val')],
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None,
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n,
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    # Flatten the per-fold histories into one frame with a 3-level column index.
    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    # Summary of the last boosting iteration across all folds and repetitions.
    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))

    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Aggregate `x` over `feature_list` and optionally join the result back.

    Parameters
    ----------
    x : source DataFrame.
    feature_list : columns to group by.
    function_dict : column -> aggregation(s) mapping passed to `agg`.
    column_name : flat names assigned to the aggregated columns, in order.
    merge : when True, left-merge the aggregates onto `x`.

    Returns
    -------
    The aggregated frame when `merge` is False, otherwise the tuple
    (`x` with the aggregate columns merged in, aggregated frame).
    '''
    aggregated = x.groupby(feature_list).agg(function_dict)
    aggregated.columns = column_name
    if not merge:
        return aggregated
    return x.merge(aggregated, on=feature_list, how='left'), aggregated

### Load Data

In [6]:
# Read the raw tables; `test` is indexed by its ID column, and the free-text
# name columns of `item` and `shop` are dropped right away.
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz').set_index('ID')
shop = pd.read_csv('all/shops-translated.csv').drop(['Name'], axis=1)
item = pd.read_csv('all/item_category.csv').drop(['item_name_translated'], axis=1)

# Integer-encode the categorical columns. One encoder object is reused;
# fit_transform refits it on each column.
le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
shop['City'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

In [7]:
# Tag the test rows with block 34 (the training split later keeps blocks <= 33)
# and keep only the three key columns, in the order used for the training data.
test = test.assign(date_block_num=34)[['date_block_num', 'shop_id', 'item_id']]

In [8]:
# Attach shop and item attributes to every sales record, then derive the
# revenue of each record as price times units sold.
train = (
    train
    .merge(shop, on='shop_id', how='left')
    .merge(item, on='item_id', how='left')
)
train['revenue'] = train['item_price'] * train['item_cnt_day']

### Experiments

In [9]:
# Aggregates per (date_block_num, shop_id, item_id): price mean/std, total units
# and total revenue. String aggregator names are used because recent pandas
# versions deprecate passing NumPy callables (np.mean, np.std, np.sum) to agg;
# the strings keep the current groupby behaviour.
x = train.groupby(['date_block_num', 'shop_id', 'item_id']).agg(
    {'item_price': ['mean', 'std'], 'item_cnt_day': 'sum', 'revenue': 'sum'})

# Flat names for the aggregated columns; `cols` is extended by later cells.
cols = ['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']
x.columns = cols

# Append the test rows (which have no aggregate values) below the training
# rows; ignore_index=True already yields a fresh 0..n-1 index.
x = x.reset_index()
x = pd.concat((x, test), sort=False, ignore_index=True)

# Attach shop and item attributes.
x = pd.merge(x, shop, on='shop_id', how='left')
x = pd.merge(x, item, on='item_id', how='left')

In [10]:
# Keep the key columns, the four shop/item attributes and the four monthly
# aggregates, in that order. This is the same selection the previous
# insert-at-3 / truncate-to-11 slicing produced, spelled out so it does not
# depend on how many columns the merges happened to add.
x = x[['date_block_num', 'shop_id', 'item_id',
       'City', 'Type', 'item_cat1', 'item_cat2',
       'price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']]

#### Create combinations of features

##### `(item_id, Type)`

- count of shops with the same type and selling the same item, i.e., having the same `(item_id, Type)`
- average sales count among these shops

In [11]:
# Aggregates over rows sharing (date_block_num, item_id, Type).
# The 'count' column holds the number of non-null cnt_sum_month rows per group.
feature_list = ['date_block_num', 'item_id', 'Type']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_item_type', 'cnt_mean_item_type', 'price_mean_item_type', 'rev_mean_item_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(item_id, City)`

In [12]:
# Aggregates over rows sharing (date_block_num, item_id, City).
# The 'count' column holds the number of non-null cnt_sum_month rows per group.
feature_list = ['date_block_num', 'item_id', 'City']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_item_city', 'cnt_mean_item_city', 'price_mean_item_city', 'rev_mean_item_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, shop)`

In [13]:
# Aggregates over rows sharing (date_block_num, item_cat1, shop_id).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat1', 'shop_id']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat1_shop', 'cnt_mean_cat1_shop', 'price_mean_cat1_shop', 'rev_mean_cat1_shop']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, City)`

In [14]:
# Aggregates over rows sharing (date_block_num, item_cat1, City).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat1', 'City']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat1_city', 'cnt_mean_cat1_city', 'price_mean_cat1_city', 'rev_mean_cat1_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, Type)`

In [15]:
# Aggregates over rows sharing (date_block_num, item_cat1, Type).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat1', 'Type']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat1_type', 'cnt_mean_cat1_type', 'price_mean_cat1_type', 'rev_mean_cat1_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, shop)`

In [16]:
# Aggregates over rows sharing (date_block_num, item_cat2, shop_id).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat2', 'shop_id']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat2_shop', 'cnt_mean_cat2_shop', 'price_mean_cat2_shop', 'rev_mean_cat2_shop']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, city)`

In [17]:
# Aggregates over rows sharing (date_block_num, item_cat2, City).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat2', 'City']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat2_city', 'cnt_mean_cat2_city', 'price_mean_cat2_city', 'rev_mean_cat2_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, type)`

In [18]:
# Aggregates over rows sharing (date_block_num, item_cat2, Type).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat2', 'Type']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat2_type', 'cnt_mean_cat2_type', 'price_mean_cat2_type', 'rev_mean_cat2_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, cat2)`

In [19]:
# Aggregates over rows sharing (date_block_num, item_cat1, item_cat2).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'item_cat1', 'item_cat2']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_cat1_cat2', 'cnt_mean_cat1_cat2', 'price_mean_cat1_cat2', 'rev_mean_cat1_cat2']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(city, type)`

In [20]:
# Aggregates over rows sharing (date_block_num, City, Type).
# NOTE: despite the 'shop_count_' prefix, the count here is the number of
# non-null cnt_sum_month rows per group, not a number of shops.
feature_list = ['date_block_num', 'City', 'Type']
# String aggregator names: recent pandas versions deprecate NumPy callables
# such as np.mean in agg; 'mean' keeps the current behaviour.
function_list = {'cnt_sum_month': ['count', 'mean'], 'price_mean_month': 'mean', 'rev_sum_month': 'mean'}
column_name = ['shop_count_city_type', 'cnt_mean_city_type', 'price_mean_city_type', 'rev_mean_city_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

#### Write a function for the analysis

In [21]:
# Inspect the last rows. These are the appended test rows (date_block_num 34),
# so the aggregate columns are NaN and the group counts are 0.
x.tail()

Unnamed: 0,date_block_num,shop_id,item_id,City,Type,item_cat1,item_cat2,price_mean_month,price_std_month,cnt_sum_month,...,price_mean_cat2_type,rev_mean_cat2_type,shop_count_cat1_cat2,cnt_mean_cat1_cat2,price_mean_cat1_cat2,rev_mean_cat1_cat2,shop_count_city_type,cnt_mean_city_type,price_mean_city_type,rev_mean_city_type
1823319,34,45,18454,15,4,12,16,,,,...,,,0,,,,0,,,
1823320,34,45,16188,15,4,10,12,,,,...,,,0,,,,0,,,
1823321,34,45,15757,15,4,12,16,,,,...,,,0,,,,0,,,
1823322,34,45,19648,15,4,3,23,,,,...,,,0,,,,0,,,
1823323,34,45,969,15,4,3,9,,,,...,,,0,,,,0,,,


In [22]:
# Add lag-1 and lag-2 versions of every column listed in `cols`.
# NOTE(review): newer tqdm releases flag tqdm_notebook as deprecated in favour
# of tqdm.notebook.tqdm -- confirm against the installed version.
for c in tqdm.tqdm_notebook(cols):
    x = lag_features(x, (1, 2), c)

HBox(children=(IntProgress(value=0, max=44), HTML(value='')))




In [23]:
# Drop the un-lagged feature columns and keep 'cnt_sum_month'.
# Built without list.remove / inplace so that re-running the cell is harmless
# (list.remove raises ValueError once the name is already gone).
cols = [name for name in cols if name != 'cnt_sum_month']
x = x.drop(columns=cols, errors='ignore')

#### CV

In [24]:
# Blocks up to 33 form the training frame; block 34 is the test frame, which
# carries no target column.
x_train = x.loc[x['date_block_num'] <= 33, :].copy()
x_test = x.loc[x['date_block_num'] == 34, :].drop(['cnt_sum_month'], axis=1)

# Separate the target from the training features.
# The target is clipped to [0, 40] here, while the clip_rmse metric clips both
# labels and predictions to [0, 20] when scoring.
# This is similar to the idea of calibration.
y_train = x_train.pop('cnt_sum_month').clip(0, 40)

In [25]:
# Booster parameters. 'maximize' is intentionally not listed here: it is an
# argument of xgb.train (cv already passes maximize=False), not a booster
# parameter.
# NOTE(review): 'silent' is reported as deprecated in favour of 'verbosity' by
# newer xgboost releases -- confirm against the installed version.
param = {'max_depth': 9,
         'subsample': 0.9,
         'min_child_weight': 3,
         'eta': 0.03,
         'lambda': 2,
         'colsample_bytree': 0.6,
         'seed': 1,
         'silent': 1,
         'nthread': 8}

# Boosting rounds per fold and per-iteration logging.
n_tree = 100
verbose = True

# 2 repetitions x 5 folds = 10 trained models.
n_split = 5
n_repetition = 2

df, clf, running_time = cv(param, n_repetition, n_split, n_tree, verbose, 42)

[0]	train-rmse:3.64565	val-rmse:3.65052	train-clip-rmse:2.94016	val-clip-rmse:2.94299
[1]	train-rmse:3.5955	val-rmse:3.60141	train-clip-rmse:2.88998	val-clip-rmse:2.89353
[2]	train-rmse:3.54715	val-rmse:3.55382	train-clip-rmse:2.84179	val-clip-rmse:2.8458
[3]	train-rmse:3.50142	val-rmse:3.5091	train-clip-rmse:2.79647	val-clip-rmse:2.80115
[4]	train-rmse:3.45679	val-rmse:3.46532	train-clip-rmse:2.75256	val-clip-rmse:2.75779
[5]	train-rmse:3.41398	val-rmse:3.42323	train-clip-rmse:2.71047	val-clip-rmse:2.71609
[6]	train-rmse:3.37305	val-rmse:3.38299	train-clip-rmse:2.67047	val-clip-rmse:2.67653
[7]	train-rmse:3.33276	val-rmse:3.34327	train-clip-rmse:2.63143	val-clip-rmse:2.63775
[8]	train-rmse:3.29491	val-rmse:3.30599	train-clip-rmse:2.59496	val-clip-rmse:2.60159
[9]	train-rmse:3.25937	val-rmse:3.27154	train-clip-rmse:2.56111	val-clip-rmse:2.5685
[10]	train-rmse:3.22543	val-rmse:3.23868	train-clip-rmse:2.52883	val-clip-rmse:2.53691
[11]	train-rmse:3.19253	val-rmse:3.2067	train-clip-rmse:2

[95]	train-rmse:2.45707	val-rmse:2.52638	train-clip-rmse:1.92734	val-clip-rmse:1.96661
[96]	train-rmse:2.45414	val-rmse:2.52384	train-clip-rmse:1.92536	val-clip-rmse:1.96482
[97]	train-rmse:2.45253	val-rmse:2.52262	train-clip-rmse:1.92431	val-clip-rmse:1.96401
[98]	train-rmse:2.45031	val-rmse:2.52077	train-clip-rmse:1.92291	val-clip-rmse:1.96282
[99]	train-rmse:2.44898	val-rmse:2.5198	train-clip-rmse:1.92206	val-clip-rmse:1.96221
Repeat 0, split 0, val score = 1.962, running time = 7.738 min.
[0]	train-rmse:3.64704	val-rmse:3.6492	train-clip-rmse:2.94131	val-clip-rmse:2.94274
[1]	train-rmse:3.59576	val-rmse:3.59783	train-clip-rmse:2.89002	val-clip-rmse:2.89127
[2]	train-rmse:3.54655	val-rmse:3.54892	train-clip-rmse:2.84105	val-clip-rmse:2.84244
[3]	train-rmse:3.4998	val-rmse:3.50197	train-clip-rmse:2.79453	val-clip-rmse:2.79574
[4]	train-rmse:3.45492	val-rmse:3.45754	train-clip-rmse:2.75021	val-clip-rmse:2.75174
[5]	train-rmse:3.4123	val-rmse:3.41507	train-clip-rmse:2.70831	val-clip-rm

[89]	train-rmse:2.4717	val-rmse:2.52121	train-clip-rmse:1.93424	val-clip-rmse:1.9682
[90]	train-rmse:2.46853	val-rmse:2.51852	train-clip-rmse:1.93217	val-clip-rmse:1.96647
[91]	train-rmse:2.46523	val-rmse:2.51617	train-clip-rmse:1.93	val-clip-rmse:1.96493
[92]	train-rmse:2.46292	val-rmse:2.51422	train-clip-rmse:1.92868	val-clip-rmse:1.96383
[93]	train-rmse:2.4612	val-rmse:2.51295	train-clip-rmse:1.92758	val-clip-rmse:1.96302
[94]	train-rmse:2.45949	val-rmse:2.51185	train-clip-rmse:1.92665	val-clip-rmse:1.96245
[95]	train-rmse:2.45708	val-rmse:2.50976	train-clip-rmse:1.92491	val-clip-rmse:1.96093
[96]	train-rmse:2.45557	val-rmse:2.50854	train-clip-rmse:1.92403	val-clip-rmse:1.96026
[97]	train-rmse:2.45385	val-rmse:2.50721	train-clip-rmse:1.92304	val-clip-rmse:1.95953
[98]	train-rmse:2.45231	val-rmse:2.50597	train-clip-rmse:1.92207	val-clip-rmse:1.95882
[99]	train-rmse:2.45056	val-rmse:2.50457	train-clip-rmse:1.92094	val-clip-rmse:1.95792
Repeat 0, split 1, val score = 1.958, running tim

[83]	train-rmse:2.47841	val-rmse:2.55657	train-clip-rmse:1.93874	val-clip-rmse:1.98846
[84]	train-rmse:2.4751	val-rmse:2.55394	train-clip-rmse:1.93654	val-clip-rmse:1.98666
[85]	train-rmse:2.47306	val-rmse:2.5524	train-clip-rmse:1.93543	val-clip-rmse:1.98578
[86]	train-rmse:2.47117	val-rmse:2.55107	train-clip-rmse:1.93439	val-clip-rmse:1.98504
[87]	train-rmse:2.46936	val-rmse:2.54977	train-clip-rmse:1.93347	val-clip-rmse:1.98436
[88]	train-rmse:2.46734	val-rmse:2.54841	train-clip-rmse:1.93232	val-clip-rmse:1.98358
[89]	train-rmse:2.46555	val-rmse:2.54696	train-clip-rmse:1.93121	val-clip-rmse:1.98275
[90]	train-rmse:2.46384	val-rmse:2.54565	train-clip-rmse:1.93016	val-clip-rmse:1.98192
[91]	train-rmse:2.46233	val-rmse:2.54447	train-clip-rmse:1.92932	val-clip-rmse:1.98129
[92]	train-rmse:2.4606	val-rmse:2.54321	train-clip-rmse:1.92837	val-clip-rmse:1.98061
[93]	train-rmse:2.45846	val-rmse:2.54165	train-clip-rmse:1.927	val-clip-rmse:1.97963
[94]	train-rmse:2.45661	val-rmse:2.54016	train-c

[78]	train-rmse:2.49444	val-rmse:2.54631	train-clip-rmse:1.94896	val-clip-rmse:1.97632
[79]	train-rmse:2.49127	val-rmse:2.54392	train-clip-rmse:1.94709	val-clip-rmse:1.97491
[80]	train-rmse:2.48897	val-rmse:2.54211	train-clip-rmse:1.94576	val-clip-rmse:1.97382
[81]	train-rmse:2.48684	val-rmse:2.54034	train-clip-rmse:1.94459	val-clip-rmse:1.97284
[82]	train-rmse:2.48479	val-rmse:2.53877	train-clip-rmse:1.94345	val-clip-rmse:1.97194
[83]	train-rmse:2.4829	val-rmse:2.53722	train-clip-rmse:1.94234	val-clip-rmse:1.97103
[84]	train-rmse:2.4793	val-rmse:2.53394	train-clip-rmse:1.93993	val-clip-rmse:1.96881
[85]	train-rmse:2.47723	val-rmse:2.53229	train-clip-rmse:1.93872	val-clip-rmse:1.96784
[86]	train-rmse:2.47572	val-rmse:2.53109	train-clip-rmse:1.93788	val-clip-rmse:1.9671
[87]	train-rmse:2.47359	val-rmse:2.52946	train-clip-rmse:1.93659	val-clip-rmse:1.96618
[88]	train-rmse:2.47168	val-rmse:2.52791	train-clip-rmse:1.93555	val-clip-rmse:1.96528
[89]	train-rmse:2.4696	val-rmse:2.52627	train-

KeyboardInterrupt: 

In [27]:
# Inspect the last training rows. Lag columns are NaN wherever the left merge
# in lag_features found no row for the same shop/item in the lagged block.
x_train.tail()

Unnamed: 0,date_block_num,shop_id,item_id,City,Type,item_cat1,item_cat2,price_mean_month_lag_1,price_mean_month_lag_2,price_std_month_lag_1,...,rev_mean_cat1_cat2_lag_1,rev_mean_cat1_cat2_lag_2,shop_count_city_type_lag_1,shop_count_city_type_lag_2,cnt_mean_city_type_lag_1,cnt_mean_city_type_lag_2,price_mean_city_type_lag_1,price_mean_city_type_lag_2,rev_mean_city_type_lag_1,rev_mean_city_type_lag_2
1609119,33,59,22087,27,1,7,60,119.0,119.0,0.0,...,389.940825,387.652653,491.0,593.0,1.861507,2.047218,1136.382939,894.9442,2233.918534,1684.612142
1609120,33,59,22088,27,1,7,60,119.0,119.0,,...,389.940825,387.652653,491.0,593.0,1.861507,2.047218,1136.382939,894.9442,2233.918534,1684.612142
1609121,33,59,22091,27,1,7,60,179.0,,0.0,...,389.940825,,491.0,,1.861507,,1136.382939,,2233.918534,
1609122,33,59,22100,27,1,2,3,629.0,,,...,3722.749814,,491.0,,1.861507,,1136.382939,,2233.918534,
1609123,33,59,22102,27,1,2,3,,1250.0,,...,,3218.079588,,593.0,,2.047218,,894.9442,,1684.612142
