In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Function

In [2]:
def lag_features(df, lags, merge_cols, shift_cols, fillna_value=None):
    '''create lag features of col'''
    cols = copy.copy(merge_cols)
    cols.extend(shift_cols)
    tmp = df.loc[:, cols]
    for i in lags:
        shifted = tmp.copy()
        shifted_cols = shifted.columns.tolist()
        shifted_cols = [c+'_lag_'+str(i) if c in shift_cols else c 
                        for c in shifted_cols]
        shifted.columns = shifted_cols
        shifted[merge_cols] += i
        shifted.drop_duplicates(inplace=True)
        df = pd.merge(df, shifted, on=merge_cols, how='left')
    if fillna_value is not None:
        df.fillna(fillna_value, inplace=True)
    return df    

In [3]:
def clip_rmse(preds, dtrain):
    y_test = np.array(dtrain.get_label())
    preds = np.array(preds)
    y_test = np.maximum(np.minimum(y_test, 20), 0)
    preds = np.maximum(np.minimum(preds, 20), 0)
    #preds = np.array(list(map(lambda x: min(20, max(x, 0)), list(preds))))
    #y_test = np.array(list(map(lambda x: min(20, max(x, 0)), list(y_test))))
    rmse = np.sqrt(mean_squared_error(preds,y_test))
    return 'clip-rmse', rmse

In [4]:
def cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, random_state):
    '''Repeated CV'''
    
    cv_results = {}
    clf = {}
    running_time = {}

    np.random.seed(random_state)

    for m in range(n_repetition):
        # Train and valuation sets split
        skf = StratifiedKFold(n_splits=n_split, random_state=np.random.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(x_train.iloc[train_index], label=y_train.iloc[train_index])
            dval = xgb.DMatrix(x_train.iloc[val_index], label=y_train.iloc[val_index])

            # Placeholder for evals_results
            cv_results[m, n] = {}

            param['seed'] = np.random.randint(10**6)
            clf[m, n] = xgb.train(param, dtrain,num_boost_round=n_tree, 
                                  evals=[(dtrain, 'train'), (dval, 'val')], 
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None, 
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n, 
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))
    
    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Combination of new features'''
    tmp = x.groupby(feature_list).agg(function_dict)
    tmp.columns = column_name
    if merge:
        x = x.merge(tmp, on=feature_list, how='left')
        return x, tmp
    else:
        return tmp

### Load and Preprocess Data

In [6]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

test.set_index('ID', inplace=True)
item.drop(['item_name_translated'], axis=1, inplace=True)
shop.drop(['Name'], axis=1, inplace=True)

le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
shop['City'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

Remove outliers

In [7]:
train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]
median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

Fix shop names and ids

In [8]:
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

Monthly sales for all the samples

In [9]:
x = pd.DataFrame(train.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].sum())

x.reset_index(inplace=True)

In [10]:
test['date_block_num'] = 34

In [11]:
x = pd.concat((x, test), sort=False).fillna(0.0)

Add shop/item features

In [12]:
x = pd.merge(x, shop, on='shop_id', how='left')
x = pd.merge(x, item, on='item_id', how='left')

Change column names

In [13]:
x.columns = ['month', 'shop_id', 'item_id', 'sales_month', 
             'City', 'Type', 'item_cat1', 'item_cat2']

Change column types

In [14]:
x['City'] = x['City'].astype(np.int8)
x['Type'] = x['Type'].astype(np.int8)
x['item_cat1'] = x['item_cat1'].astype(np.int8)
x['item_cat2'] = x['item_cat2'].astype(np.int8)
x['sales_month'] = x['sales_month'].astype(np.float16)
x['month'] = x['month'].astype(np.int8)
x['shop_id'] = x['shop_id'].astype(np.int8)

### Target Lags

In [15]:
x = lag_features(x, [1, 2, 3, 6, 12], 
                 ['month', 'shop_id', 'item_id'], 
                 ['sales_month'], fillna_value=0)

### Mean Encoding

`sales_mean_month`

In [17]:
group = x.groupby('month').agg({'sales_month': ['mean']})
group.columns = ['sales_mean_month']
group.reset_index(inplace=True)

x = pd.merge(x, group, on=['month'], how='left')
x['sales_mean_month'] = x['sales_mean_month'].astype(np.float16)
x = lag_features(x, [1], ['month'], ['sales_mean_month'], fillna_value=0.0)
x.drop(['sales_mean_month'], axis=1, inplace=True)

`sales_mean_month_item`

In [19]:
col = 'sales_mean_month_item'
group = x.groupby(['month', 'item_id']).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=['month', 'item_id'], how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1, 2, 3, 6, 12], 
                 ['month', 'item_id'], 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_shop`

In [22]:
col = 'sales_mean_month_shop'
merge_cols = ['month', 'shop_id']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1, 2, 3, 6, 12], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_cat1`

In [24]:
col = 'sales_mean_month_cat1'
merge_cols = ['month', 'item_cat1']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_shop_cat1`

In [27]:
col = 'sales_mean_month_shop_cat1'
merge_cols = ['month', 'shop_id', 'item_cat1']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_shop_type`

In [30]:
col = 'sales_mean_month_shop_type'
merge_cols = ['month', 'shop_id', 'Type']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_shop_cat2`

In [33]:
col = 'sales_mean_month_shop_cat2'
merge_cols = ['month', 'shop_id', 'item_cat2']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_city`

In [36]:
col = 'sales_mean_month_city'
merge_cols = ['month', 'City']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_item_city`

In [37]:
col = 'sales_mean_month_item_city'
merge_cols = ['month', 'item_id', 'City']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_cat2`

In [38]:
col = 'sales_mean_month_cat2'
merge_cols = ['month', 'item_cat2']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

`sales_mean_month_cat1_cat2`

In [39]:
col = 'sales_mean_month_cat1_cat2'
merge_cols = ['month', 'item_cat1', 'item_cat2']
group = x.groupby(merge_cols).agg({'sales_month': ['mean']})
group.columns = [col]
group.reset_index(inplace=True)

x = pd.merge(x, group, on=merge_cols, how='left')
x[col] = x[col].astype(np.float16)
x = lag_features(x, [1, 2, 3, 6], 
                 merge_cols, 
                 [col], fillna_value=0.0)
x.drop([col], axis=1, inplace=True)

#### CV

In [49]:
x.replace([np.inf, -np.inf], np.nan, inplace=True)
x.fillna(0.0, inplace=True)

In [54]:
# Split train and test sets
x_train = x.loc[(x['month']<=33) & (x['month']>=12), :].copy()
x_test = x.loc[x['month']==34, :].copy()

# Drop target from test set
x_test.drop(['month'], axis=1, inplace=True)

# Split target from train set
# Note that target is first clipped to (0, 40), then clipped to (0, 20) in test set. 
# This is similar to the idea of calibration
y_train = x_train['sales_month'].clip(0, 20)
x_train.drop(['sales_month'], axis=1, inplace=True)

In [55]:
param = {'max_depth':8, 
         'subsample':0.8,
         'min_child_weight':300,
         'eta':0.3, 
         'lambda':2,
         'colsample_bytree':0.8,
         'seed':1,
         'silent':1,
         'maximize': False,
         'nthread':8}

n_tree = 80
verbose = True

n_split = 3
n_repetition = 1

df, clf, running_time = cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, 42)

  return umr_sum(a, axis, dtype, out, keepdims)


[0]	train-rmse:2.65226	val-rmse:2.65368	train-clip-rmse:2.65226	val-clip-rmse:2.65368
[1]	train-rmse:2.45934	val-rmse:2.46022	train-clip-rmse:2.45934	val-clip-rmse:2.46022
[2]	train-rmse:2.35618	val-rmse:2.35648	train-clip-rmse:2.35618	val-clip-rmse:2.35648
[3]	train-rmse:2.30456	val-rmse:2.30485	train-clip-rmse:2.30456	val-clip-rmse:2.30485
[4]	train-rmse:2.25256	val-rmse:2.25482	train-clip-rmse:2.25256	val-clip-rmse:2.25482
[5]	train-rmse:2.22492	val-rmse:2.22807	train-clip-rmse:2.22492	val-clip-rmse:2.22807
[6]	train-rmse:2.21374	val-rmse:2.21725	train-clip-rmse:2.21374	val-clip-rmse:2.21725
[7]	train-rmse:2.19072	val-rmse:2.19603	train-clip-rmse:2.19072	val-clip-rmse:2.19603
[8]	train-rmse:2.18074	val-rmse:2.18679	train-clip-rmse:2.18074	val-clip-rmse:2.18679
[9]	train-rmse:2.17177	val-rmse:2.17905	train-clip-rmse:2.17177	val-clip-rmse:2.17905
[10]	train-rmse:2.15772	val-rmse:2.16608	train-clip-rmse:2.15772	val-clip-rmse:2.16608
[11]	train-rmse:2.15069	val-rmse:2.15933	train-clip-r

[14]	train-rmse:2.12931	val-rmse:2.14959	train-clip-rmse:2.12931	val-clip-rmse:2.14959
[15]	train-rmse:2.12509	val-rmse:2.14556	train-clip-rmse:2.12509	val-clip-rmse:2.14556
[16]	train-rmse:2.11407	val-rmse:2.13475	train-clip-rmse:2.11407	val-clip-rmse:2.13475
[17]	train-rmse:2.11064	val-rmse:2.13158	train-clip-rmse:2.11064	val-clip-rmse:2.13158
[18]	train-rmse:2.10697	val-rmse:2.12825	train-clip-rmse:2.10697	val-clip-rmse:2.12825
[19]	train-rmse:2.09707	val-rmse:2.11744	train-clip-rmse:2.09707	val-clip-rmse:2.11744
[20]	train-rmse:2.09202	val-rmse:2.11298	train-clip-rmse:2.09202	val-clip-rmse:2.11297
[21]	train-rmse:2.08081	val-rmse:2.10223	train-clip-rmse:2.0808	val-clip-rmse:2.10222
[22]	train-rmse:2.0776	val-rmse:2.09959	train-clip-rmse:2.0776	val-clip-rmse:2.09958
[23]	train-rmse:2.07358	val-rmse:2.09571	train-clip-rmse:2.07358	val-clip-rmse:2.0957
[24]	train-rmse:2.07113	val-rmse:2.09366	train-clip-rmse:2.07112	val-clip-rmse:2.09365
[25]	train-rmse:2.07079	val-rmse:2.09345	train-

[29]	train-rmse:2.06139	val-rmse:2.07267	train-clip-rmse:2.06109	val-clip-rmse:2.07227
[30]	train-rmse:2.05398	val-rmse:2.06623	train-clip-rmse:2.05368	val-clip-rmse:2.06582
[31]	train-rmse:2.04944	val-rmse:2.06234	train-clip-rmse:2.04913	val-clip-rmse:2.06193
[32]	train-rmse:2.04858	val-rmse:2.0615	train-clip-rmse:2.04827	val-clip-rmse:2.06109
[33]	train-rmse:2.04466	val-rmse:2.05795	train-clip-rmse:2.04435	val-clip-rmse:2.05753
[34]	train-rmse:2.04031	val-rmse:2.05355	train-clip-rmse:2.04	val-clip-rmse:2.05313
[35]	train-rmse:2.03775	val-rmse:2.05118	train-clip-rmse:2.03742	val-clip-rmse:2.05075
[36]	train-rmse:2.03526	val-rmse:2.04882	train-clip-rmse:2.03494	val-clip-rmse:2.04839
[37]	train-rmse:2.03441	val-rmse:2.04811	train-clip-rmse:2.03408	val-clip-rmse:2.04768
[38]	train-rmse:2.03002	val-rmse:2.04455	train-clip-rmse:2.02966	val-clip-rmse:2.0441
[39]	train-rmse:2.02563	val-rmse:2.04058	train-clip-rmse:2.02527	val-clip-rmse:2.04013
[40]	train-rmse:2.02257	val-rmse:2.03758	train-c

In [55]:
a = {k: clf[0, 0].get_score(importance_type='gain') for k in clf.keys()}

In [56]:
b = pd.DataFrame(a)

In [57]:
b.columns = list(range(n_split))

In [58]:
c = b.mean(axis=1).sort_values(ascending=False)

In [59]:
d = c.head(50).index.tolist()
e = []
f = []
for n in d:
    if len(n.split('_'))>=2 and n.split('_')[-2] == 'lag':
        e.append('_'.join(n.split('_')[:-2]))
        f.append(int(n.split('_')[-1]))
        
e = set(e)
f = set(f)

In [60]:
e

{'cnt_mean_cat1_cat2',
 'cnt_mean_cat1_shop',
 'cnt_mean_cat1_type',
 'cnt_mean_cat2_type',
 'cnt_mean_item_city',
 'cnt_sum_month',
 'price_mean_cat1_cat2',
 'price_mean_cat1_type',
 'price_mean_item_city',
 'price_mean_month',
 'rev_mean_cat1_cat2',
 'rev_mean_cat1_city',
 'rev_mean_cat1_type',
 'rev_mean_cat2_type',
 'rev_mean_item_city',
 'rev_mean_item_type',
 'rev_sum_month',
 'shop_count_cat1_cat2'}

In [61]:
f

{1, 2, 3, 4, 5, 6}

In [62]:
26*10

260

In [64]:
b.to_csv('eda_11_5_feature_importance.csv')