In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

if os.name=='nt':
    # On Windows, prepend the MinGW-w64 bin directory to PATH before xgboost is
    # imported below (presumably so its native runtime can be located - confirm).
    # A missing PATH is treated as empty rather than hidden behind a bare
    # `except:`, which would also have swallowed unrelated errors.
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    os.environ['PATH'] = mingw_path + ';' + os.environ.get('PATH', '')
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Functions

In [2]:
def lag_features(df, lags, col):
    '''Append lagged copies of `col` to `df`.

    For every lag `k` in `lags`, the value of `col` observed for the same
    (shop_id, item_id) `k` months earlier is left-joined onto `df` as a new
    column named `<col>_lag_<k>`. Rows without a matching earlier month get NaN.
    Returns the widened DataFrame; the input frame is not modified.
    '''
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df.loc[:, keys + [col]]
    for lag in lags:
        lagged = base.copy()
        lagged.columns = keys + [col+'_lag_'+str(lag)]
        # Moving the month forward by `lag` makes the join pick up the past value
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = pd.merge(df, lagged, on=keys, how='left')
    return df

In [3]:
def clip_rmse(preds, dtrain):
    '''Custom evaluation metric: RMSE computed after clipping to [0, 20].

    Both the predictions and the labels read from `dtrain.get_label()` are
    clipped to the range [0, 20] before the error is computed.
    Returns the pair ('clip-rmse', value).
    '''
    def _clipped(values):
        # Cap at 20 first, then floor at 0
        return np.maximum(np.minimum(np.array(values), 20), 0)

    labels = _clipped(dtrain.get_label())
    clipped_preds = _clipped(preds)
    return 'clip-rmse', np.sqrt(mean_squared_error(clipped_preds, labels))

In [4]:
def cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, random_state):
    '''Repeated stratified K-fold cross-validation of an xgboost model.

    Parameters
    ----------
    x_train : DataFrame of features, sliced positionally with `.iloc`.
    y_train : Series of targets aligned with `x_train`; also used for stratification.
    param : dict of xgboost parameters. It is NOT modified: every fold trains on
        a shallow copy whose 'seed' entry is replaced by a freshly drawn seed.
    n_repetition : number of times the K-fold split is repeated.
    n_split : number of folds per repetition.
    n_tree : number of boosting rounds.
    verbose : forwarded to `verbose_eval` of `xgb.train`.
    random_state : seed of the generator that draws the split seeds and the
        per-fold model seeds.

    Returns
    -------
    df : DataFrame indexed by boosting iteration, with columns
        (dataset, repetition, cv_split) holding the 'clip-rmse' history.
    clf : dict mapping (repetition, split) to the trained booster.
    running_time : dict mapping (repetition, split) to training time in seconds.
    '''

    cv_results = {}
    clf = {}
    running_time = {}

    # A private generator produces the same draws as seeding the global NumPy
    # RNG with `random_state`, but leaves the caller's global random state alone.
    rng = np.random.RandomState(random_state)

    for m in range(n_repetition):
        # Train and validation sets split
        skf = StratifiedKFold(n_splits=n_split, random_state=rng.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(x_train.iloc[train_index], label=y_train.iloc[train_index])
            dval = xgb.DMatrix(x_train.iloc[val_index], label=y_train.iloc[val_index])

            # Placeholder that xgb.train fills with the evaluation history
            cv_results[m, n] = {}

            # Copy so that the caller's parameter dict keeps its original seed
            fold_param = dict(param)
            fold_param['seed'] = rng.randint(10**6)
            clf[m, n] = xgb.train(fold_param, dtrain, num_boost_round=n_tree,
                                  evals=[(dtrain, 'train'), (dval, 'val')],
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None,
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n, 
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    # Flatten the nested histories into one column per (dataset, repetition, split)
    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    # Summary over all folds at the last boosting iteration
    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))

    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Aggregate `x` over the groups defined by `feature_list`.

    `function_dict` is handed to `groupby(...).agg(...)`, and the resulting
    columns are renamed to `column_name`.
    With `merge=False` only the aggregated table is returned.
    With `merge=True` the aggregates are left-joined back onto `x` and the
    tuple `(merged_x, aggregated_table)` is returned.
    '''
    aggregated = x.groupby(feature_list).agg(function_dict)
    aggregated.columns = column_name
    if not merge:
        return aggregated
    return x.merge(aggregated, on=feature_list, how='left'), aggregated

### Load Data

In [6]:
# Load data: raw sales, the test set (indexed by ID) and the shop / item lookup
# tables with their free-text name columns removed
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz').set_index('ID')
shop = pd.read_csv('all/shops-translated.csv').drop(columns=['Name'])
item = pd.read_csv('all/item_category.csv').drop(columns=['item_name_translated'])

# Integer-encode the categorical columns; the encoder is refit for every column
le = LabelEncoder()
for item_col in ('item_cat1', 'item_cat2'):
    item[item_col] = le.fit_transform(item[item_col].astype(str))
for shop_col in ('City', 'Type'):
    shop[shop_col] = le.fit_transform(shop[shop_col])

In [7]:
# Give every test row date_block_num 34, then keep only the three key columns
test['date_block_num'] = 34
test = test.loc[:, ['date_block_num', 'shop_id', 'item_id']]

In [8]:
# Attach shop and item attributes to every sales record, then compute revenue per record
train = (train
         .merge(shop, on='shop_id', how='left')
         .merge(item, on='item_id', how='left'))
train['revenue'] = train['item_price'] * train['item_cnt_day']

### Experiments

In [9]:
# Monthly aggregates per (date_block_num, shop_id, item_id).
# String aliases are used instead of NumPy callables: pandas deprecates passing
# np.mean / np.std / np.sum to agg, and the aliases keep the current results.
x = train.groupby(['date_block_num',
                   'shop_id',
                   'item_id']).agg({
    'item_price': ['mean', 'std'], 'item_cnt_day': 'sum', 'revenue': 'sum'})

# Flat names for the aggregated columns, in the order produced by the agg spec above
cols = ['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']
x.columns = cols

x.reset_index(inplace=True)
# Append the test rows; ignore_index=True already yields a fresh 0..n-1 index,
# so no additional reset_index is needed afterwards
x = pd.concat((x, test), sort=False, ignore_index=True)

# Attach shop and item attributes to training and test rows alike
x = pd.merge(x, shop, on='shop_id', how='left')
x = pd.merge(x, item, on='item_id', how='left')

In [10]:
# Column order: the first three columns of x, then the four categorical
# attributes, then the next four columns of x; anything beyond that is dropped
tmp = x.columns.tolist()
tmp = tmp[:3] + ['City', 'Type', 'item_cat1', 'item_cat2'] + tmp[3:7]
x = x[tmp]

In [11]:
# Read the first two columns of the saved feature-importance table (the same
# file name is written by the last cell of this notebook), highest score first
a = pd.read_csv('eda_11_5_feature_importance.csv', usecols=[0, 1])
a.columns = ['feature_name', 'score']
a = a.sort_values(by='score', ascending=False)

#### Lag of `['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']`

In [12]:
# Lags 1..6 for every column currently listed in `cols`
for base_col in tqdm.tqdm_notebook(cols):
    x = lag_features(x, range(1, 7), base_col)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




#### Create combinations of features

##### `(item_id, Type)`

- count of shops with the same type and selling the same item, i.e., having the same `(item_id, Type)`
- average sales count among these shops

In [13]:
# Print rank, name and score of the highest-ranked 'item_type' feature, then stop
for i in range(len(a)):
    feature_name = a.iloc[i, 0]
    if 'item_type' not in feature_name:
        continue
    print(i, feature_name, a.iloc[i, 1])
    break
# (item, type) is not good

43 rev_mean_item_type_lag_1 205.1912295064547


In [14]:
# Mean monthly revenue per (date_block_num, item_id, Type).
# The 'mean' alias replaces np.mean, which pandas deprecates as an agg argument.
feature_list = ['date_block_num', 'item_id', 'Type']
function_list = {'rev_sum_month':'mean'}
column_name = ['rev_mean_item_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [15]:
# Lag 1 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1], new_col)

##### `(item_id, City)`

In [16]:
# List the 'item_city' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'item_city' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (item, city) keeps cnt_mean, rev_mean, price_mean, shop_count
# lag = 1, 2, 5

15 price_mean_item_city_lag_1 407.2581965373557
17 cnt_mean_item_city_lag_1 402.0346140541581
32 rev_mean_item_city_lag_1 247.7419952194
42 cnt_mean_item_city_lag_2 207.6280725768317


In [None]:
# Mean monthly count, price and revenue per (date_block_num, item_id, City).
# All aggregations use the 'mean' alias; np.mean is deprecated as an agg argument.
feature_list = ['date_block_num', 'item_id', 'City']
function_list = {'cnt_sum_month':'mean', 'price_mean_month':'mean', 'rev_sum_month':'mean'}
column_name = ['cnt_mean_item_city', 'price_mean_item_city', 'rev_mean_item_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [None]:
# Lags 1 and 2 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1, 2], new_col)

##### `(cat1, shop)`

In [19]:
# List the 'cat1_shop' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat1_shop' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (cat1, shop) keeps cnt_mean
# lag = 1

29 cnt_mean_cat1_shop_lag_1 110.13483789434125


In [20]:
# Mean monthly sales count per (date_block_num, item_cat1, shop_id)
column_name = ['cnt_mean_cat1_shop']
feature_list = ['date_block_num', 'item_cat1', 'shop_id']
function_list = dict(cnt_sum_month='mean')
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [21]:
# Lag 1 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1], new_col)

##### `(cat1, City)`

In [22]:
# List the 'cat1_city' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat1_city' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (cat1, city) keeps rev_mean
# lag = 2

41 rev_mean_cat1_city_lag_2 95.76644741700045


In [23]:
# Mean monthly revenue per (date_block_num, item_cat1, City).
# The 'mean' alias replaces np.mean, which pandas deprecates as an agg argument.
feature_list = ['date_block_num', 'item_cat1', 'City']
function_list = {'rev_sum_month':'mean'}
column_name = ['rev_mean_cat1_city']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [24]:
# Lags 1 and 2 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1, 2], new_col)

##### `(cat1, Type)`

In [25]:
# List the 'cat1_type' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat1_type' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (cat1, type) keeps cnt_mean, rev_mean, price_mean, shop_count
# lag = 1, 2, 3

22 rev_mean_cat1_type_lag_1 131.5846505113204
24 price_mean_cat1_type_lag_1 124.85491306936169
38 cnt_mean_cat1_type_lag_1 96.7988390802824
46 cnt_mean_cat1_type_lag_3 89.34707166635323
49 shop_count_cat1_type_lag_2 87.08559819783136


In [26]:
# Per (date_block_num, item_cat1, Type): count of non-null monthly sales rows,
# plus mean count, mean price and mean revenue.
# The 'mean' alias replaces np.mean, which pandas deprecates as an agg argument.
feature_list = ['date_block_num', 'item_cat1', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':'mean', 'rev_sum_month':'mean'}
column_name = ['shop_count_cat1_type', 'cnt_mean_cat1_type', 'price_mean_cat1_type', 'rev_mean_cat1_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [27]:
# Lags 1..3 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1, 2, 3], new_col)

##### `(cat2, shop)`

In [28]:
# List the 'cat2_shop' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat2_shop' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (cat2, shop) is not good

In [29]:
#feature_list = ['date_block_num', 'item_cat2', 'shop_id']
#function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
#column_name = ['shop_count_cat2_shop', 'cnt_mean_cat2_shop', 'price_mean_cat2_shop', 'rev_mean_cat2_shop']
#cols.extend(column_name)
#x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, city)`

In [30]:
# List the 'cat2_city' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat2_city' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (cat2, city) keeps shop_count
# lag = 8

12 shop_count_cat2_city_lag_8 265.7488978910201


In [31]:
#feature_list = ['date_block_num', 'item_cat2', 'City']
#function_list = {'cnt_sum_month':'count'}
#column_name = ['shop_count_cat2_city']
#cols.extend(column_name)
#x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [32]:
#for c in column_name:
#    x = lag_features(x, [1, 2, 3, 4, 5, 6, 7, 8], c)

##### `(cat2, type)`

In [33]:
# List the 'cat2_type' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat2_type' in feature_name:
        print(i, feature_name, a.iloc[i, 1])

17 price_mean_cat2_type_lag_1 175.68014164708492
18 rev_mean_cat2_type_lag_3 168.95614200670346
32 rev_mean_cat2_type_lag_1 105.55702993735979
35 cnt_mean_cat2_type_lag_4 102.2163544836913
50 cnt_mean_cat2_type_lag_10 86.86068853386844
51 cnt_mean_cat2_type_lag_2 86.70648924048363


In [34]:
# Mean monthly count, price and revenue per (date_block_num, item_cat2, Type).
# All aggregations use the 'mean' alias; np.mean is deprecated as an agg argument.
feature_list = ['date_block_num', 'item_cat2', 'Type']
function_list = {'cnt_sum_month':'mean', 'price_mean_month':'mean', 'rev_sum_month':'mean'}
column_name = ['cnt_mean_cat2_type', 'price_mean_cat2_type', 'rev_mean_cat2_type']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [35]:
# Lags 1..4 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1, 2, 3, 4], new_col)

##### `(cat1, cat2)`

In [36]:
# List the 'cat1_cat2' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'cat1_cat2' in feature_name:
        print(i, feature_name, a.iloc[i, 1])

1 cnt_mean_cat1_cat2_lag_1 2225.2410129612413
4 shop_count_cat1_cat2_lag_2 444.35774689280294
5 cnt_mean_cat1_cat2_lag_2 369.8038679774178
9 price_mean_cat1_cat2_lag_6 291.7342651120839
14 price_mean_cat1_cat2_lag_1 255.2306733343093
15 rev_mean_cat1_cat2_lag_1 243.23460913270299
19 shop_count_cat1_cat2_lag_3 159.70604870486397
30 rev_mean_cat1_cat2_lag_2 108.58749733343019
33 shop_count_cat1_cat2_lag_1 102.61773123032671
34 rev_mean_cat1_cat2_lag_10 102.2174760845
39 price_mean_cat1_cat2_lag_2 96.01661601979878
42 shop_count_cat1_cat2_lag_5 95.44898210324594


In [37]:
# Per (date_block_num, item_cat1, item_cat2): count of non-null monthly sales
# rows, plus mean count, mean price and mean revenue.
# The 'mean' alias replaces np.mean, which pandas deprecates as an agg argument.
feature_list = ['date_block_num', 'item_cat1', 'item_cat2']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':'mean', 'rev_sum_month':'mean'}
column_name = ['shop_count_cat1_cat2', 'cnt_mean_cat1_cat2', 'price_mean_cat1_cat2', 'rev_mean_cat1_cat2']
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [38]:
# Lags 1..6 of each feature created in the previous cell
for new_col in column_name:
    x = lag_features(x, [1, 2, 3, 4, 5, 6], new_col)

##### `(city, type)`

In [39]:
# List the 'city_type' features found at ranks 0..51 of the importance table
for i in range(min(len(a), 52)):
    feature_name = a.iloc[i, 0]
    if 'city_type' in feature_name:
        print(i, feature_name, a.iloc[i, 1])
# (city, type) is not good

26 price_mean_city_type_lag_12 117.35751098371831
36 rev_mean_city_type_lag_5 102.06072545810963


In [40]:
#feature_list = ['date_block_num', 'City', 'Type']
#function_list = {'price_mean_month':np.mean, 'rev_sum_month':np.mean}
#column_name = ['price_mean_city_type', 'rev_mean_city_type']
#cols.extend(column_name)
#x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

In [41]:
#for c in column_name:
#    x = lag_features(x, [1, 5, 12], c)

Remove the current-month values of these features, since this information is not available for the test data.

In [46]:
# Drop every current-month column listed in `cols` except the target
# 'cnt_sum_month'. The guards make the cell safe to re-run: the original raised
# ValueError on the second run because the target had already been removed
# from `cols`, and the columns had already been dropped from `x`.
if 'cnt_sum_month' in cols:
    cols.remove('cnt_sum_month')
x = x.drop(columns=cols, errors='ignore')

#### CV

In [48]:
# Split train and test sets
x_train = x.loc[(x['date_block_num']<=33) & (x['date_block_num']>=12), :].copy()
x_test = x.loc[x['date_block_num']==34, :].copy()

# Drop target from test set
x_test.drop(['cnt_sum_month'], axis=1, inplace=True)

# Split target from train set
# Note that target is first clipped to (0, 40), then clipped to (0, 20) in test set. 
# This is similar to the idea of calibration
y_train = x_train['cnt_sum_month'].clip(0, 40)
x_train.drop(['cnt_sum_month'], axis=1, inplace=True)

In [54]:
# Booster parameters. 'maximize' is not listed: it is an argument of
# xgb.train(), which cv() already passes as maximize=False, not a booster
# parameter. cv() draws its own 'seed' for every fold.
param = {'max_depth':8,
         'subsample':0.8,
         'min_child_weight':5,
         'eta':0.6,
         'lambda':2,
         'colsample_bytree':0.8,
         'seed':1,
         'silent':1,
         'nthread':8}

# Boosting rounds per fold and per-iteration logging
n_tree = 50
verbose = True

# Cross-validation layout
n_split = 3
n_repetition = 1

df, clf, running_time = cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, 42)

[0]	train-rmse:2.70818	val-rmse:2.72908	train-clip-rmse:2.10916	val-clip-rmse:2.13311
[1]	train-rmse:2.49598	val-rmse:2.53852	train-clip-rmse:1.96729	val-clip-rmse:2.00585
[2]	train-rmse:2.40677	val-rmse:2.46412	train-clip-rmse:1.91022	val-clip-rmse:1.95527
[3]	train-rmse:2.35775	val-rmse:2.43248	train-clip-rmse:1.88285	val-clip-rmse:1.93753
[4]	train-rmse:2.32685	val-rmse:2.40973	train-clip-rmse:1.86688	val-clip-rmse:1.92427
[5]	train-rmse:2.31498	val-rmse:2.40099	train-clip-rmse:1.85887	val-clip-rmse:1.91805
[6]	train-rmse:2.29764	val-rmse:2.38975	train-clip-rmse:1.84811	val-clip-rmse:1.9104
[7]	train-rmse:2.28012	val-rmse:2.37585	train-clip-rmse:1.83606	val-clip-rmse:1.90048
[8]	train-rmse:2.22227	val-rmse:2.32743	train-clip-rmse:1.80245	val-clip-rmse:1.87431
[9]	train-rmse:2.21223	val-rmse:2.32139	train-clip-rmse:1.79366	val-clip-rmse:1.86862
[10]	train-rmse:2.20059	val-rmse:2.31608	train-clip-rmse:1.7844	val-clip-rmse:1.86436
[11]	train-rmse:2.1924	val-rmse:2.313	train-clip-rmse:1

[45]	train-rmse:1.88959	val-rmse:2.15396	train-clip-rmse:1.58643	val-clip-rmse:1.76857
[46]	train-rmse:1.88424	val-rmse:2.15334	train-clip-rmse:1.58211	val-clip-rmse:1.76827
[47]	train-rmse:1.88073	val-rmse:2.15436	train-clip-rmse:1.57985	val-clip-rmse:1.76882
[48]	train-rmse:1.8781	val-rmse:2.15517	train-clip-rmse:1.57759	val-clip-rmse:1.76914
[49]	train-rmse:1.87602	val-rmse:2.15486	train-clip-rmse:1.57601	val-clip-rmse:1.76894
Repeat 0, split 1, val score = 1.769, running time = 0.774 min.
[0]	train-rmse:2.71845	val-rmse:2.72506	train-clip-rmse:2.12001	val-clip-rmse:2.12734
[1]	train-rmse:2.48753	val-rmse:2.51275	train-clip-rmse:1.96311	val-clip-rmse:1.98129
[2]	train-rmse:2.42092	val-rmse:2.46043	train-clip-rmse:1.9237	val-clip-rmse:1.95057
[3]	train-rmse:2.38076	val-rmse:2.43506	train-clip-rmse:1.89988	val-clip-rmse:1.9344
[4]	train-rmse:2.34162	val-rmse:2.40409	train-clip-rmse:1.87305	val-clip-rmse:1.91358
[5]	train-rmse:2.32325	val-rmse:2.3925	train-clip-rmse:1.85988	val-clip-rm

In [55]:
# Gain-based feature importance of every fold model, keyed by (repetition, split).
# Each key must read its own booster: the original always read clf[0, 0], so
# every fold reported the importance of the first model.
# Note: this rebinds `a`, which earlier cells use for the loaded importance table.
a = {k: clf[k].get_score(importance_type='gain') for k in clf.keys()}

In [56]:
# Tabulate the importance dict `a`: its keys become the columns and the feature
# names returned by get_score become the row index
b = pd.DataFrame(a)

In [57]:
# Replace the (repetition, split) column keys with consecutive integers.
# Sizing by the actual column count also works when n_repetition > 1, where
# range(n_split) would raise a length mismatch.
b.columns = list(range(len(b.columns)))

In [58]:
# Average importance of each feature across the columns of b, largest first
mean_gain = b.mean(axis=1)
c = mean_gain.sort_values(ascending=False)

In [59]:
# From the 50 top-ranked feature names, collect the base names and the lag
# numbers of every name shaped like '<base>_lag_<k>'
d = c.head(50).index.tolist()
e = set()
f = set()
for n in d:
    parts = n.split('_')
    if len(parts) < 2 or parts[-2] != 'lag':
        continue
    e.add('_'.join(parts[:-2]))
    f.add(int(parts[-1]))

In [60]:
# Display the set of base feature names (lag suffix removed) collected above
e

{'cnt_mean_cat1_cat2',
 'cnt_mean_cat1_shop',
 'cnt_mean_cat1_type',
 'cnt_mean_cat2_type',
 'cnt_mean_item_city',
 'cnt_sum_month',
 'price_mean_cat1_cat2',
 'price_mean_cat1_type',
 'price_mean_item_city',
 'price_mean_month',
 'rev_mean_cat1_cat2',
 'rev_mean_cat1_city',
 'rev_mean_cat1_type',
 'rev_mean_cat2_type',
 'rev_mean_item_city',
 'rev_mean_item_type',
 'rev_sum_month',
 'shop_count_cat1_cat2'}

In [61]:
# Display the set of lag numbers collected above
f

{1, 2, 3, 4, 5, 6}

In [62]:
# Scratch arithmetic; the result is not used by any later cell shown here
26*10

260

In [64]:
# Save the per-fold importance table. NOTE(review): this overwrites the same
# file that an earlier cell reads with usecols=[0, 1], so the next run of the
# notebook ranks features by the index column plus the first fold column only.
b.to_csv('eda_11_5_feature_importance.csv')