In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

if os.name=='nt':
    # On Windows, put the MinGW bin directory at the front of PATH before xgboost is imported
    # (presumably so the xgboost native library can locate the MinGW runtime -- TODO confirm).
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    current_path = os.environ.get('PATH', '')
    # Skip the prepend when the directory is already listed, so re-running this cell
    # does not keep growing PATH.
    if mingw_path not in current_path.split(os.pathsep):
        os.environ['PATH'] = mingw_path + os.pathsep + current_path if current_path else mingw_path
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Functions

In [2]:
def lag_features(df, lags, col):
    '''Append lagged copies of `col` to `df`.

    For every lag `k` in `lags`, a column named `<col>_lag_<k>` is added that holds the
    value `col` had `k` date blocks earlier for the same (shop_id, item_id) pair.
    Rows without a matching earlier record get NaN (left merge).

    Parameters
    ----------
    df : DataFrame containing 'date_block_num', 'shop_id', 'item_id' and `col`.
    lags : iterable of lag offsets, applied to 'date_block_num'.
    col : name of the column to lag.

    Returns
    -------
    A new DataFrame with one extra column per lag; the input frame is not modified.
    '''
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    base = df.loc[:, key_cols + [col]]
    for lag in lags:
        lagged = base.copy()
        lagged.columns = key_cols + [col + '_lag_' + str(lag)]
        # Moving the block number forward by `lag` lines the old value up with the later row.
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = pd.merge(df, lagged, on=key_cols, how='left')
    return df

In [3]:
def clip_rmse(preds, dtrain):
    '''Custom xgboost evaluation metric: RMSE after clipping to [0, 20].

    Both the predictions and the labels read from `dtrain.get_label()` are clipped
    to the interval [0, 20] before the error is computed.

    Returns
    -------
    tuple ('clip-rmse', value), the (name, score) pair passed back to xgb.train.
    '''
    labels = np.clip(np.array(dtrain.get_label()), 0, 20)
    clipped_preds = np.clip(np.array(preds), 0, 20)
    score = np.sqrt(mean_squared_error(clipped_preds, labels))
    return 'clip-rmse', score

In [4]:
def cv(param, n_repetition, n_split, n_tree, verbose, random_state, features=None, target=None):
    '''Repeated stratified K-fold cross-validation of an xgboost model.

    Parameters
    ----------
    param : dict of booster parameters. It is copied for every fold, so the caller's
        dict is left untouched; the 'seed' entry is replaced by a fresh random seed per fold.
    n_repetition : number of times the whole K-fold split is repeated.
    n_split : number of folds per repetition.
    n_tree : number of boosting rounds (num_boost_round).
    verbose : forwarded to xgb.train as verbose_eval.
    random_state : seed for numpy's global RNG, from which the split seeds and the
        per-fold booster seeds are drawn.
    features, target : optional training frame and label series. When omitted, the
        notebook-level `x_train` / `y_train` are used, as before.

    Returns
    -------
    df : DataFrame of the 'clip-rmse' history, indexed by boosting iteration, with
        columns keyed by (dataset, repetition, cv_split).
    clf : dict mapping (repetition, split) to the trained booster.
    running_time : dict mapping (repetition, split) to the fold's wall time in seconds.

    Notes
    -----
    StratifiedKFold stratifies on `target`; this presumably relies on the target taking
    discrete values -- TODO confirm for the data being passed.
    '''
    # Fall back to the notebook globals so existing six-argument calls keep working.
    if features is None:
        features = x_train
    if target is None:
        target = y_train

    cv_results = {}
    clf = {}
    running_time = {}

    np.random.seed(random_state)

    for m in range(n_repetition):
        # Train and validation sets split; a new shuffle seed is drawn for each repetition
        skf = StratifiedKFold(n_splits=n_split, random_state=np.random.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(features, target)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(features.iloc[train_index], label=target.iloc[train_index])
            dval = xgb.DMatrix(features.iloc[val_index], label=target.iloc[val_index])

            # Placeholder for evals_results (filled in by xgb.train)
            cv_results[m, n] = {}

            # Work on a copy so the per-fold seed does not leak into the caller's dict.
            fold_param = dict(param)
            fold_param['seed'] = np.random.randint(10**6)
            clf[m, n] = xgb.train(fold_param, dtrain,num_boost_round=n_tree, 
                                  evals=[(dtrain, 'train'), (dval, 'val')], 
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None, 
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n, 
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    # Flatten the nested histories into {(dataset, repetition, split): [score per iteration]}
    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    # Summary over all folds at the last boosting iteration
    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))
    
    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Aggregate `x` per group and optionally join the aggregates back onto it.

    Parameters
    ----------
    x : source DataFrame.
    feature_list : columns to group by.
    function_dict : mapping passed to `.agg` (column -> function or list of functions).
    column_name : flat names assigned to the aggregated columns, in `.agg` output order.
    merge : when True, left-merge the aggregates onto `x` using `feature_list` as keys.

    Returns
    -------
    The aggregated frame when `merge` is False, otherwise the tuple
    (merged frame, aggregated frame).
    '''
    aggregated = x.groupby(feature_list).agg(function_dict)
    aggregated.columns = column_name
    if not merge:
        return aggregated
    merged = x.merge(aggregated, on=feature_list, how='left')
    return merged, aggregated

### Load Data

In [6]:
# Load the raw tables; the free-text name columns are dropped straight away
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz').set_index('ID')
shop = pd.read_csv('all/shops-translated.csv').drop(['Name'], axis=1)
item = pd.read_csv('all/item_category.csv').drop(['item_name_translated'], axis=1)

# Integer-encode the categorical columns. The single encoder is refit for each column,
# so after this cell `le` only holds the mapping of the last column fitted (shop Type).
le = LabelEncoder()
for item_col in ('item_cat1', 'item_cat2'):
    item[item_col] = le.fit_transform(item[item_col].astype(str))
for shop_col in ('City', 'Type'):
    shop[shop_col] = le.fit_transform(shop[shop_col])

In [7]:
# Tag every test row with date block 34 (the block the later split treats as the test month)
# and keep only the three key columns, in the same order as the training keys.
test = test.assign(date_block_num=34)[['date_block_num', 'shop_id', 'item_id']]

In [8]:
# Attach shop and item attributes to every daily sales record
train = (
    train
    .merge(shop, on='shop_id', how='left')
    .merge(item, on='item_id', how='left')
)
# Daily revenue = unit price x units sold that day
train['revenue'] = train['item_price'] * train['item_cnt_day']

### Experiments

In [9]:
# Monthly aggregates per (month, shop, item): price mean/std, total units, total revenue
monthly_aggs = {
    'item_price': [np.mean, np.std],
    'item_cnt_day': np.sum,
    'revenue': np.sum,
}
cols = ['price_mean_month', 'price_std_month', 'cnt_sum_month', 'rev_sum_month']

x = train.groupby(['date_block_num', 'shop_id', 'item_id']).agg(monthly_aggs)
x.columns = cols
x = x.reset_index()

# Append the test keys below the training rows; their aggregate columns are left as NaN
x = pd.concat((x, test), sort=False, ignore_index=True).reset_index(drop=True)

# Bring the shop and item attributes in for train and test rows alike
x = pd.merge(x, shop, on='shop_id', how='left')
x = pd.merge(x, item, on='item_id', how='left')

In [10]:
# Move the four categorical attributes right after the three key columns and keep only
# the first 11 columns (3 keys + 4 attributes + 4 monthly aggregates).
tmp = x.columns.tolist()
tmp = (tmp[:3] + ['City', 'Type', 'item_cat1', 'item_cat2'] + tmp[3:])[:11]
x = x[tmp]

#### Create combinations of features

##### `(item_id, Type)`

- count of shops with the same type and selling the same item, i.e., having the same `(item_id, Type)`
- average sales count among these shops

In [11]:
# Aggregate over all rows sharing (month, item_id, Type): count and mean of cnt_sum_month,
# mean of price_mean_month and mean of rev_sum_month, then left-merge the result onto x.
# NOTE(review): not re-runnable -- a second run appends the names to `cols` again and
# merges the same aggregates into x once more.
feature_list = ['date_block_num', 'item_id', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_item_type', 'cnt_mean_item_type', 'price_mean_item_type', 'rev_mean_item_type']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(item_id, City)`

In [12]:
# Aggregate over all rows sharing (month, item_id, City): count and mean of cnt_sum_month,
# mean of price_mean_month and mean of rev_sum_month, then left-merge the result onto x.
# NOTE(review): not re-runnable -- a second run appends the names to `cols` again and
# merges the same aggregates into x once more.
feature_list = ['date_block_num', 'item_id', 'City']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_item_city', 'cnt_mean_item_city', 'price_mean_item_city', 'rev_mean_item_city']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, shop)`

In [13]:
# Aggregate over all rows sharing (month, item_cat1, shop_id): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so within a single shop the
# 'shop_count_*' column counts item rows rather than shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat1', 'shop_id']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_shop', 'cnt_mean_cat1_shop', 'price_mean_cat1_shop', 'rev_mean_cat1_shop']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, City)`

In [14]:
# Aggregate over all rows sharing (month, item_cat1, City): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so the 'shop_count_*' column
# counts (shop, item) rows in the group, not distinct shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat1', 'City']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_city', 'cnt_mean_cat1_city', 'price_mean_cat1_city', 'rev_mean_cat1_city']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, Type)`

In [15]:
# Aggregate over all rows sharing (month, item_cat1, Type): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so the 'shop_count_*' column
# counts (shop, item) rows in the group, not distinct shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat1', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_type', 'cnt_mean_cat1_type', 'price_mean_cat1_type', 'rev_mean_cat1_type']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, shop)`

In [16]:
# Aggregate over all rows sharing (month, item_cat2, shop_id): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so within a single shop the
# 'shop_count_*' column counts item rows rather than shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat2', 'shop_id']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat2_shop', 'cnt_mean_cat2_shop', 'price_mean_cat2_shop', 'rev_mean_cat2_shop']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, city)`

In [17]:
# Aggregate over all rows sharing (month, item_cat2, City): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so the 'shop_count_*' column
# counts (shop, item) rows in the group, not distinct shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat2', 'City']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat2_city', 'cnt_mean_cat2_city', 'price_mean_cat2_city', 'rev_mean_cat2_city']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat2, type)`

In [18]:
# Aggregate over all rows sharing (month, item_cat2, Type): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so the 'shop_count_*' column
# counts (shop, item) rows in the group, not distinct shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat2', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat2_type', 'cnt_mean_cat2_type', 'price_mean_cat2_type', 'rev_mean_cat2_type']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(cat1, cat2)`

In [19]:
# Aggregate over all rows sharing (month, item_cat1, item_cat2): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so the 'shop_count_*' column
# counts (shop, item) rows in the group, not distinct shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'item_cat1', 'item_cat2']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_cat1_cat2', 'cnt_mean_cat1_cat2', 'price_mean_cat1_cat2', 'rev_mean_cat1_cat2']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

##### `(city, type)`

In [20]:
# Aggregate over all rows sharing (month, City, Type): count and mean of
# cnt_sum_month, mean of price_mean_month and mean of rev_sum_month; left-merged onto x.
# NOTE(review): x holds one row per (month, shop, item), so the 'shop_count_*' column
# counts (shop, item) rows in the group, not distinct shops -- confirm the name is intended.
# NOTE(review): not re-runnable -- a second run extends `cols` and merges again.
feature_list = ['date_block_num', 'City', 'Type']
function_list = {'cnt_sum_month':['count', 'mean'], 'price_mean_month':np.mean, 'rev_sum_month':np.mean}
column_name = ['shop_count_city_type', 'cnt_mean_city_type', 'price_mean_city_type', 'rev_mean_city_type']
# Register the new columns in `cols` so they are lagged (and later dropped) with the rest
cols.extend(column_name)
x, tmp = feature_combination(x, feature_list, function_list, column_name, merge=True)

#### Write a function for the analysis

In [21]:
# Inspect the last rows of x. The test keys were concatenated after the training rows,
# so the tail shows date block 34 with the current-month aggregates still empty.
x.tail()

Unnamed: 0,date_block_num,shop_id,item_id,City,Type,item_cat1,item_cat2,price_mean_month,price_std_month,cnt_sum_month,...,price_mean_cat2_type,rev_mean_cat2_type,shop_count_cat1_cat2,cnt_mean_cat1_cat2,price_mean_cat1_cat2,rev_mean_cat1_cat2,shop_count_city_type,cnt_mean_city_type,price_mean_city_type,rev_mean_city_type
1823319,34,45,18454,15,4,12,16,,,,...,,,0,,,,0,,,
1823320,34,45,16188,15,4,10,12,,,,...,,,0,,,,0,,,
1823321,34,45,15757,15,4,12,16,,,,...,,,0,,,,0,,,
1823322,34,45,19648,15,4,3,23,,,,...,,,0,,,,0,,,
1823323,34,45,969,15,4,3,9,,,,...,,,0,,,,0,,,


In [22]:
# Add lag-1 and lag-2 versions of every column registered in `cols`
# (the four monthly aggregates plus all group aggregates added above).
# Each call performs one left merge per lag on the full frame.
# NOTE(review): not re-runnable -- a second run merges the same '<col>_lag_<k>' columns again.
for c in tqdm.tqdm_notebook(cols):
    x = lag_features(x, range(1, 3), c)

HBox(children=(IntProgress(value=0, max=44), HTML(value='')))




In [25]:
# Drop every current-month column listed in `cols`, keeping the target 'cnt_sum_month'.
# Only the lagged versions stay in x as model inputs.
# Both steps are guarded so the cell can be re-run: list.remove raises ValueError once the
# target has been removed, and drop raises KeyError for columns that are already gone.
if 'cnt_sum_month' in cols:
    cols.remove('cnt_sum_month')
x.drop(columns=[name for name in cols if name in x.columns], inplace=True)

#### CV

In [27]:
# Split train and test sets
x_train = x.loc[x['date_block_num']<=33, :].copy()
x_test = x.loc[x['date_block_num']==34, :].copy()

# Drop target from test set
x_test.drop(['cnt_sum_month'], axis=1, inplace=True)

# Split target from train set
# Note that target is first clipped to (0, 40), then clipped to (0, 20) in test set. 
# This is similar to the idea of calibration
y_train = x_train['cnt_sum_month'].clip(0, 40)
x_train.drop(['cnt_sum_month'], axis=1, inplace=True)

KeyError: "['cnt_sum_month'] not found in axis"

In [None]:
# Booster parameters handed to xgb.train through cv().
# cv() draws a fresh 'seed' for every fold, so the 'seed' value below is not the one used
# in training; cv() also passes maximize=False to xgb.train directly.
# NOTE(review): whether the installed xgboost still accepts 'silent' and 'maximize' as
# booster parameters is not visible here -- TODO confirm against the installed version.
param = {'max_depth':5, 
         'subsample':0.9,
         'min_child_weight':3,
         'eta':0.3, 
         'lambda':2,
         'colsample_bytree':0.6,
         'seed':1,
         'silent':1,
         'maximize': False,
         'nthread':8}

# Number of boosting rounds per fold (passed as num_boost_round); 2 looks like a
# smoke-test setting rather than a tuned value -- TODO confirm.
n_tree = 2
verbose = True

# 3 folds x 2 repetitions = 6 trained boosters
n_split = 3
n_repetition = 2

# The last positional argument (42) is random_state, which seeds numpy's global RNG in cv()
df, clf, running_time = cv(param, n_repetition, n_split, n_tree, verbose, 42)