In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time
import gc
import itertools
import tqdm
import time
import copy

import scipy.stats as ss
from numba import jit

# On Windows, put the MinGW-w64 runtime directory at the front of PATH before
# xgboost is imported (the import follows this block).
if os.name == 'nt':
    mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
    # A missing PATH variable was the only failure the old bare `except: pass`
    # could hide; handle it explicitly instead of swallowing every exception.
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Helper Functions

In [2]:
def lag_features(df, lags, merge_cols, shift_cols, fillna_value=None, time_col=None):
    '''Append lagged copies of ``shift_cols`` to ``df``.

    For every lag ``i`` in ``lags`` and every column ``c`` in ``shift_cols`` a
    new column ``c_lag_<i>`` is added.  On each row it holds the value ``c`` had
    on the row with the same ``merge_cols`` key whose ``time_col`` is ``i``
    smaller.  Rows without such a predecessor get NaN (or ``fillna_value``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lags : iterable of int
        Offsets, in units of ``time_col``.
    merge_cols : list of str
        Key columns used to align lagged rows with current rows.
    shift_cols : list of str
        Columns to create lagged copies of.
    fillna_value : scalar, optional
        If not None, every NaN in the returned frame (not only in the new lag
        columns) is replaced by this value.
    time_col : str, optional
        The column of ``merge_cols`` that is shifted.  Defaults to
        ``merge_cols[0]``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the lag columns appended, in the order of ``lags``.

    Raises
    ------
    ValueError
        If ``time_col`` is given but is not one of ``merge_cols``.
    '''
    merge_cols = list(merge_cols)
    shift_cols = list(shift_cols)

    if time_col is None:
        time_col = merge_cols[0]
    elif time_col not in merge_cols:
        raise ValueError('time_col {!r} must be one of merge_cols {!r}'.format(time_col, merge_cols))

    # The unique (key, value) rows do not depend on the lag, so build them once.
    base = df.loc[:, merge_cols + shift_cols].drop_duplicates()

    result = df
    for i in lags:
        shifted = base.rename(columns={c: c + '_lag_' + str(i) for c in shift_cols})
        # Only the time column moves.  Adding ``i`` to every key column (as the
        # previous version did) also shifted ids such as shop_id / item_id, so
        # the lag was joined from an unrelated entity.
        shifted[time_col] = shifted[time_col] + i
        result = pd.merge(result, shifted, on=merge_cols, how='left')

    if fillna_value is not None:
        if result is df:
            # No merge happened (empty ``lags``): do not mutate the caller's frame.
            result = result.fillna(fillna_value)
        else:
            # ``result`` is a fresh merge output, so filling in place is safe.
            result.fillna(fillna_value, inplace=True)
    return result

In [3]:
def clip_rmse(preds, dtrain):
    '''Custom evaluation metric: RMSE after clipping to the range [0, 20].

    Both the predictions and the labels read from ``dtrain.get_label()`` are
    clipped before the error is computed.

    Returns the pair ``('clip-rmse', value)``.
    '''
    lower, upper = 0, 20
    clipped_labels = np.clip(np.array(dtrain.get_label()), lower, upper)
    clipped_preds = np.clip(np.array(preds), lower, upper)
    score = np.sqrt(mean_squared_error(clipped_preds, clipped_labels))
    return 'clip-rmse', score

In [4]:
def cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose, random_state):
    '''Repeated stratified K-fold cross-validation of an XGBoost model.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature rows; folds are selected with ``.iloc``.
    y_train : pandas.Series
        Labels aligned positionally with ``x_train``; also used to stratify.
    param : dict
        Parameters passed to ``xgb.train``.  The dict is not modified: each
        fold trains with a copy whose ``'seed'`` entry is replaced by a freshly
        drawn value.
    n_repetition : int
        Number of times the K-fold split is repeated with a new shuffle.
    n_split : int
        Number of folds per repetition.
    n_tree : int
        Number of boosting rounds (no early stopping is used).
    verbose : bool or int
        Forwarded to ``xgb.train`` as ``verbose_eval``.
    random_state : int
        Seed for numpy's global RNG, from which the fold-shuffle seeds and the
        per-fold model seeds are drawn.

    Returns
    -------
    df : pandas.DataFrame
        ``clip-rmse`` per boosting iteration; columns are a MultiIndex
        ``(dataset, repetition, cv_split)`` with dataset in {'train', 'val'}.
    clf : dict
        Trained boosters keyed by ``(repetition, cv_split)``.
    running_time : dict
        Training wall time in seconds keyed by ``(repetition, cv_split)``.
    '''

    cv_results = {}
    clf = {}
    running_time = {}

    # NOTE: this reseeds numpy's global RNG as a side effect.
    np.random.seed(random_state)

    for m in range(n_repetition):
        # Train and validation sets split
        skf = StratifiedKFold(n_splits=n_split, random_state=np.random.randint(10**6), shuffle=True)

        for n, (train_index, val_index) in enumerate(skf.split(x_train, y_train)):

            start_time = time.time()

            # Construct DMatrix
            dtrain = xgb.DMatrix(x_train.iloc[train_index], label=y_train.iloc[train_index])
            dval = xgb.DMatrix(x_train.iloc[val_index], label=y_train.iloc[val_index])

            # Placeholder for evals_results
            cv_results[m, n] = {}

            # Use a per-fold copy so the caller's ``param`` keeps its own seed.
            fold_param = dict(param)
            fold_param['seed'] = np.random.randint(10**6)
            clf[m, n] = xgb.train(fold_param, dtrain, num_boost_round=n_tree,
                                  evals=[(dtrain, 'train'), (dval, 'val')],
                                  feval=clip_rmse, maximize=False, early_stopping_rounds=None,
                                  evals_result=cv_results[m, n], verbose_eval=verbose)

            running_time[m, n] = time.time() - start_time

            print('Repeat {}, split {}, val score = {:.3f}, running time = {:.3f} min.'.format(m, n,
                cv_results[m, n]['val']['clip-rmse'][-1], running_time[m, n]/60))

    # Flatten the per-fold learning curves into one frame.
    cv_results_final = {}
    for m in range(n_repetition):
        for n in range(n_split):
            cv_results_final['train', m, n] = cv_results[m, n]['train']['clip-rmse']
            cv_results_final['val', m, n] = cv_results[m, n]['val']['clip-rmse']

    df = pd.DataFrame(cv_results_final)
    df.index.name = 'iteration'
    df.columns.names = ['dataset', 'repetition', 'cv_split']

    # Summary over folds at the last boosting iteration.
    print('Val mean = {:.3f}, std = {:.3f}'.format(df['val'].iloc[-1].mean(), df['val'].iloc[-1].std()))

    return df, clf, running_time

In [5]:
def feature_combination(x, feature_list, function_dict, column_name, merge=False):
    '''Aggregate ``x`` over ``feature_list`` and optionally join the result back.

    ``function_dict`` is passed to ``groupby(...).agg`` and the aggregated
    columns are renamed to ``column_name``.

    Returns the aggregated frame when ``merge`` is False; otherwise returns the
    tuple ``(x left-joined with the aggregates, aggregated frame)``.
    '''
    aggregated = x.groupby(feature_list).agg(function_dict)
    aggregated.columns = column_name
    if not merge:
        return aggregated
    joined = x.merge(aggregated, on=feature_list, how='left')
    return joined, aggregated

### Load and Preprocess Data

In [6]:
# Load the raw tables; free-text name columns are dropped right away.
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz').set_index('ID')
shop = pd.read_csv('all/shops-translated.csv').drop(columns=['Name'])
item = pd.read_csv('all/item_category.csv').drop(columns=['item_name_translated'])

# Integer-encode the categorical columns (the encoder is refit for each column).
le = LabelEncoder()
for item_column in ('item_cat1', 'item_cat2'):
    item[item_column] = le.fit_transform(item[item_column].astype(str))
for shop_column in ('City', 'Type'):
    shop[shop_column] = le.fit_transform(shop[shop_column])

Remove outliers

In [7]:
# Keep only rows below the price and daily-count cut-offs.
within_limits = (train.item_price < 100000) & (train.item_cnt_day < 1001)
train = train[within_limits]

# Replace negative prices with the median positive price observed for
# shop 32 / item 2973 in month 4.
reference_rows = train[(train.shop_id == 32)
                       & (train.item_id == 2973)
                       & (train.date_block_num == 4)
                       & (train.item_price > 0)]
median = reference_rows.item_price.median()
train.loc[train.item_price < 0, 'item_price'] = median

Fix shop names and ids

In [8]:
# Re-map shop ids (old id -> id to use) identically in train and test.
for old_shop_id, new_shop_id in ((0, 57), (1, 58), (10, 11)):
    train.loc[train.shop_id == old_shop_id, 'shop_id'] = new_shop_id
    test.loc[test.shop_id == old_shop_id, 'shop_id'] = new_shop_id

Monthly sales for all the samples

In [9]:
# Sum the daily counts per (month, shop, item); the group keys become columns.
x = (train
     .groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day']
     .sum()
     .reset_index())

Training set only contains sold samples, need to extend to all samples.

There are two ways of extending:
1. the full Cartesian product of all (month, shop_id, item_id) values that appear anywhere in the data
2. within each month, the Cartesian product of the (shop_id, item_id) values that appear in that month

The first one increases the number of rows by about 26 times, the second by about 7 times (see the ratios printed below).

In [10]:
# Per-month statistics, keyed by date_block_num.
shop_count = {}
item_count = {}
product_count = {}
sample_count = {}
ratio = {}

for n in x.date_block_num.unique():
    rows_in_month = x.loc[x.date_block_num == n, :]
    shop_count[n] = rows_in_month['shop_id'].nunique()
    item_count[n] = rows_in_month['item_id'].nunique()
    sample_count[n] = len(rows_in_month)
    # Size of the shop x item grid for this month versus rows actually present.
    product_count[n] = shop_count[n] * item_count[n]
    ratio[n] = product_count[n] / sample_count[n]

# Option 1: one global grid over 34 months, all shops and all items.
overall_product = len(x.shop_id.unique()) * len(x.item_id.unique()) * 34
print('overall product count is {}, sample count is {}, ratio is {:.3f}'.format(
    overall_product, x.shape[0], overall_product / x.shape[0]))

# Option 2: a separate shop x item grid per month.
monthly_product_total = sum(product_count.values())
monthly_sample_total = sum(sample_count.values())
print('monthly product count sum is {}, monthly sample count is {}, ratio is {:.3f}'.format(
    monthly_product_total, monthly_sample_total, monthly_product_total / monthly_sample_total))

overall product count is 42260028, sample count is 1609123, ratio is 26.263
monthly product count sum is 10913804, monthly sample count is 1609123, ratio is 6.782


Try the first extending method

In [11]:
# Build every (month, shop, item) combination for months 0-33.
month = np.arange(0, 34)
shop_list = train.shop_id.unique().tolist()
item_list = train.item_id.unique().tolist()
n_rows = len(month)*len(shop_list)*len(item_list)

idx = pd.MultiIndex.from_product([month, shop_list, item_list], names=['date_block_num', 'shop_id', 'item_id'])

# Turn the index levels directly into columns.  The previous version first
# allocated an (n_rows, 2) float64 array of zeros only to drop both columns
# again, which costs 16 bytes per row for nothing.
x2 = pd.DataFrame(index=idx).reset_index()

In [12]:
# Put the observed monthly sales onto the full grid; combinations with no
# sales get 0.
grid_keys = ['date_block_num', 'shop_id', 'item_id']
x = pd.merge(x2, x, how='outer', on=grid_keys).fillna(0.0)

# Append the test rows as month 34; columns they lack are filled with 0.
test = test.assign(date_block_num=34)
x = pd.concat([x, test], sort=False).fillna(0.0)

# Release the grid frame.
del x2
gc.collect()

98

Add shop/item features

In [17]:
# Left-join the shop attributes, then the item attributes, onto every row.
x = (x
     .merge(shop, how='left', on='shop_id')
     .merge(item, how='left', on='item_id'))

Change column names

In [20]:
# Show the first rows of x to check the columns after the joins.
x.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_day,City,Type,item_cat1,item_cat2
0,0,59,22154,1.0,27,1,3,9
1,0,59,2552,0.0,27,1,12,56
2,0,59,2554,0.0,27,1,12,56
3,0,59,2555,0.0,27,1,12,15
4,0,59,2564,0.0,27,1,12,38


In [21]:
# Rename by name rather than by position: assigning a full positional list
# silently mislabels columns if their order or number ever changes.
column_renames = {'date_block_num': 'month', 'item_cnt_day': 'sales_month'}
x.columns = [column_renames.get(name, name) for name in x.columns]

Change column types

In [23]:
# Cast columns one at a time to narrower dtypes.
# NOTE(review): int8 holds at most 127 and float16 loses integer precision
# above 2048 — confirm the value ranges of these columns fit.
narrow_dtypes = {
    'City': np.int8,
    'Type': np.int8,
    'item_cat1': np.int8,
    'item_cat2': np.int8,
    'sales_month': np.float16,
    'month': np.int8,
    'shop_id': np.int8,
}
for column_name, narrow_dtype in narrow_dtypes.items():
    x[column_name] = x[column_name].astype(narrow_dtype)

### Target Lags

In [25]:
# Lagged copies of the monthly sales per (month, shop, item);
# missing lags are filled with 0.
x = lag_features(
    x,
    lags=[1, 2, 3, 6, 12],
    merge_cols=['month', 'shop_id', 'item_id'],
    shift_cols=['sales_month'],
    fillna_value=0,
)

### Mean Encoding

`sales_mean_month`

In [26]:
# Average sales over all rows of each month.
group = x.groupby('month')['sales_month'].mean().reset_index(name='sales_mean_month')

# Attach the mean, keep only its 1-month lag, then drop the same-month value
# (it is computed from the target itself).
x = x.merge(group, how='left', on=['month'])
x['sales_mean_month'] = x['sales_mean_month'].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=['month'],
                 shift_cols=['sales_mean_month'], fillna_value=0.0)
x = x.drop(columns=['sales_mean_month'])

`sales_mean_month_item`

In [27]:
col = 'sales_mean_month_item'

# Average sales per (month, item_id).
group = x.groupby(['month', 'item_id'])['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its lagged copies, then drop the same-month value.
x = x.merge(group, how='left', on=['month', 'item_id'])
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1, 2, 3, 6, 12], merge_cols=['month', 'item_id'],
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_shop`

In [28]:
col = 'sales_mean_month_shop'
merge_cols = ['month', 'shop_id']

# Average sales per (month, shop_id).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its lagged copies, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1, 2, 3, 6, 12], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_cat1`

In [29]:
col = 'sales_mean_month_cat1'
merge_cols = ['month', 'item_cat1']

# Average sales per (month, item_cat1).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_shop_cat1`

In [30]:
col = 'sales_mean_month_shop_cat1'
merge_cols = ['month', 'shop_id', 'item_cat1']

# Average sales per (month, shop_id, item_cat1).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_shop_type`

In [31]:
col = 'sales_mean_month_shop_type'
# NOTE(review): Type is joined in from the shop table by shop_id, so grouping
# by both shop_id and Type presumably yields the same groups as shop_id alone —
# confirm whether ['month', 'Type'] was intended.
merge_cols = ['month', 'shop_id', 'Type']

# Average sales per (month, shop_id, Type).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_shop_cat2`

In [32]:
col = 'sales_mean_month_shop_cat2'
merge_cols = ['month', 'shop_id', 'item_cat2']

# Average sales per (month, shop_id, item_cat2).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_city`

In [33]:
col = 'sales_mean_month_city'
merge_cols = ['month', 'City']

# Average sales per (month, City).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_item_city`

In [34]:
col = 'sales_mean_month_item_city'
merge_cols = ['month', 'item_id', 'City']

# Average sales per (month, item_id, City).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_cat2`

In [35]:
col = 'sales_mean_month_cat2'
merge_cols = ['month', 'item_cat2']

# Average sales per (month, item_cat2).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its 1-month lag, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

`sales_mean_month_cat1_cat2`

In [36]:
col = 'sales_mean_month_cat1_cat2'
merge_cols = ['month', 'item_cat1', 'item_cat2']

# Average sales per (month, item_cat1, item_cat2).
group = x.groupby(merge_cols)['sales_month'].mean().reset_index(name=col)

# Attach the mean, keep only its lagged copies, then drop the same-month value.
x = x.merge(group, how='left', on=merge_cols)
x[col] = x[col].astype(np.float16)
x = lag_features(x, lags=[1, 2, 3, 6], merge_cols=merge_cols,
                 shift_cols=[col], fillna_value=0.0)
x = x.drop(columns=[col])

#### CV

In [37]:
# Turn +/-inf into NaN first so that a single fill sets every non-finite
# entry to 0.
infinities = [np.inf, -np.inf]
x.replace(to_replace=infinities, value=np.nan, inplace=True)
x.fillna(value=0.0, inplace=True)

In [None]:
# NOTE(review): this cell was never executed (In [None]) and looks unfinished.
# The positional argument of DataFrame.sample is `n` (a row count), so 0. is
# not a fraction here; presumably `frac=<something>` was intended — confirm.
# x0 is not used by any later cell shown in this notebook.
x0 = x.sample(0.)

In [38]:
# Split train and test sets: months 12-33 for training, month 34 is the test month
x_train = x.loc[x['month'].between(12, 33), :].copy()
x_test = x.loc[x['month'] == 34, :].copy()

# Drop target from test set.
# (This previously dropped 'month', which left the placeholder target in x_test
# and gave x_test a different set of feature columns than x_train.)
x_test.drop(['sales_month'], axis=1, inplace=True)

# Split target from train set.
# The target is clipped to [0, 20], the same range clip_rmse applies to labels
# and predictions.
y_train = x_train['sales_month'].clip(0, 20)
x_train.drop(['sales_month'], axis=1, inplace=True)

In [40]:
# Booster parameters.  'maximize' is not a booster parameter (cv() already
# passes maximize=False to xgb.train), so it is no longer listed here.
# 'seed' is only a default: cv() draws a fresh seed for every fold.
param = {'max_depth': 8,
         'subsample': 0.8,
         'min_child_weight': 300,
         'eta': 0.3,
         'lambda': 2,
         'colsample_bytree': 0.8,
         'seed': 1,
         'silent': 1,
         'nthread': 8}

n_tree = 80        # boosting rounds per fold
verbose = True     # print the evaluation metric every round

n_split = 3        # folds per repetition
n_repetition = 1   # number of reshuffled K-fold repetitions

df, clf, running_time = cv(x_train, y_train, param, n_repetition, n_split, n_tree, verbose,
                           random_state=42)

  return umr_sum(a, axis, dtype, out, keepdims, initial)


MemoryError: 

In [55]:
# Gain-based feature importance of each trained booster, keyed like clf.
# (Previously every key read clf[0, 0], so all folds showed the same values.)
a = {k: booster.get_score(importance_type='gain') for k, booster in clf.items()}

In [56]:
# One column per booster, one row per feature name.
b = pd.DataFrame(data=a)

In [57]:
# Number the columns 0..k-1 using the frame's own width, so this also works
# when n_repetition > 1 (there are then n_repetition * n_split columns).
b.columns = list(range(b.shape[1]))

In [58]:
# Average importance across columns, largest first.
c = b.mean(axis='columns').sort_values(ascending=False)

In [59]:
# Among the 50 most important features, split names shaped like
# '<base>_lag_<k>' into the base name and the integer lag k.
d = c.head(50).index.tolist()
e = []
f = []
for n in d:
    name_parts = n.split('_')
    if len(name_parts) < 2 or name_parts[-2] != 'lag':
        continue
    e.append('_'.join(name_parts[:-2]))
    f.append(int(name_parts[-1]))

# Distinct base names and distinct lags.
e = set(e)
f = set(f)

In [60]:
# Distinct base names of the lag features found among the top 50.
# NOTE(review): the saved output lists cnt_*/price_*/rev_* names, none of which
# are created by the cells visible in this notebook — presumably left over from
# an earlier kernel state; re-run to confirm.
e

{'cnt_mean_cat1_cat2',
 'cnt_mean_cat1_shop',
 'cnt_mean_cat1_type',
 'cnt_mean_cat2_type',
 'cnt_mean_item_city',
 'cnt_sum_month',
 'price_mean_cat1_cat2',
 'price_mean_cat1_type',
 'price_mean_item_city',
 'price_mean_month',
 'rev_mean_cat1_cat2',
 'rev_mean_cat1_city',
 'rev_mean_cat1_type',
 'rev_mean_cat2_type',
 'rev_mean_item_city',
 'rev_mean_item_type',
 'rev_sum_month',
 'shop_count_cat1_cat2'}

In [61]:
# Distinct lag values found among the top-50 lag features.
f

{1, 2, 3, 4, 5, 6}

In [62]:
# NOTE(review): scratch arithmetic; nothing in the notebook says what 26 and 10
# stand for — document or remove.
26*10

260

In [64]:
# Persist the per-booster feature importances.
b.to_csv(path_or_buf='eda_11_5_feature_importance.csv')