In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from numba import jit

%matplotlib inline

### Load data

In [2]:
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/items.csv')
item.drop(['item_name'], axis=1, inplace=True)

### Combinations of date, shop, and item

In [3]:
from itertools import product
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []
for block_num in train['date_block_num'].unique():
    cur_shops = train.loc[train['date_block_num']==block_num, 'shop_id'].unique()
    cur_items = train.loc[train['date_block_num']==block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype=np.int32))
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

### clean up

In [4]:
train = train[train.item_price<100000]
train = train[train.item_price<1000]

### Mean encodings

In [5]:
sales_m = train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': np.sum, 
                                                                      'item_price': np.mean}).reset_index()

In [6]:
sales_m = pd.merge(grid, sales_m, on=['date_block_num', 'shop_id', 'item_id'], how='left').fillna(0.0)

In [7]:
sales_m = pd.merge(sales_m, item, on=['item_id'], how='left')

In [8]:
for type_id in ['item_id', 'shop_id', 'item_category_id']:
    for column_id, aggregator, aggtype in [('item_price', np.mean, 'avg'), 
                                           ('item_cnt_day', np.sum, 'sum'), 
                                           ('item_cnt_day', np.mean, 'avg')]:
        mean_df = sales_m.groupby([type_id, 'date_block_num']).\
            aggregate(aggregator).reset_index()[[column_id, type_id, 'date_block_num']]
        mean_df.columns = [type_id+'_'+aggtype+'_'+column_id,type_id,'date_block_num']
        sales_m = pd.merge(sales_m, mean_df, on=['date_block_num', type_id], how='left')

In [9]:
sales_m.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,item_price,item_category_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day,shop_id_avg_item_cnt_day,item_category_id_avg_item_price,item_category_id_sum_item_cnt_day,item_category_id_avg_item_cnt_day
0,59,22154,0,1.0,999.0,37,310.8,18.0,0.4,39.178139,1687.0,0.207887,53.041263,5819.0,0.190724
1,59,2552,0,0.0,0.0,58,19.977778,0.0,0.0,39.178139,1687.0,0.207887,6.195011,51.0,0.00771
2,59,2554,0,0.0,0.0,58,0.0,0.0,0.0,39.178139,1687.0,0.207887,6.195011,51.0,0.00771
3,59,2555,0,0.0,0.0,56,0.0,0.0,0.0,39.178139,1687.0,0.207887,23.386519,197.0,0.036481
4,59,2564,0,0.0,0.0,59,36.4,5.0,0.111111,39.178139,1687.0,0.207887,22.932845,637.0,0.084764


### Lag features

In [24]:
lag_variables = list(sales_m.columns[7:])+['item_cnt_day']
lags = [1, 2, 3, 4, 5, 12]
for lag in lags:
    sales_new_df = sales_m.copy()
    sales_new_df.date_block_num += lag
    sales_new_df = sales_new_df[['date_block_num', 'shop_id', 'item_id']+lag_variables]
    sales_new_df.columns = ['date_block_num', 'shop_id', 'item_id'] + [lag_feat+'_lag_'+str(lag) for lag_feat in lag_variables]
    sales_m = pd.merge(sales_m, sales_new_df, on=['date_block_num', 'shop_id', 'item_id'], how='left')

### Fillna

In [28]:
for feat in sales_m.columns:
    if 'item_cnt' in feat:
        sales_m[feat] = sales_m[feat].fillna(0.0)
    elif 'item_price' in feat:
        sales_m[feat] = sales_m[feat].fillna(sales_m[feat].median())

### Drop columns that will be used

In [34]:
cols_to_drop = lag_variables[:-1]+['item_price']

In [35]:
cols_to_drop

['item_id_sum_item_cnt_day',
 'item_id_avg_item_cnt_day',
 'shop_id_avg_item_price',
 'shop_id_sum_item_cnt_day',
 'shop_id_avg_item_cnt_day',
 'item_category_id_avg_item_price',
 'item_category_id_sum_item_cnt_day',
 'item_category_id_avg_item_cnt_day',
 'item_price']

### Recent data only

In [31]:
sales_m = sales_m[sales_m['date_block_num']>12]

### Split into train and CV

In [38]:
x_train = sales_m[sales_m['date_block_num']<33].drop(cols_to_drop, axis=1)
x_cv =  sales_m[sales_m['date_block_num']==33].drop(cols_to_drop, axis=1)

### Clip [0, 40]

In [39]:
def clip(x):
    if x>40:
        return 40
    elif x<0:
        return 0
    else:
        return x
x_train['item_cnt_day'] = x_train.apply(lambda x: clip(x['item_cnt_day']),axis=1)
x_cv['item_cnt_day'] = x_cv.apply(lambda x: clip(x['item_cnt_day']),axis=1)

In [41]:
sales_m.date_block_num.unique()

array([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
       30, 31, 32, 33], dtype=int64)

In [43]:
sales_m.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,item_price,item_category_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,...,item_cnt_day_lag_5,item_id_sum_item_cnt_day_lag_12,item_id_avg_item_cnt_day_lag_12,shop_id_avg_item_price_lag_12,shop_id_sum_item_cnt_day_lag_12,shop_id_avg_item_cnt_day_lag_12,item_category_id_avg_item_price_lag_12,item_category_id_sum_item_cnt_day_lag_12,item_category_id_avg_item_cnt_day_lag_12,item_cnt_day_lag_12
4836102,27,15242,13,2.0,699.0,63,100.76087,8.0,0.173913,68.322742,...,0.0,0.0,0.0,39.612071,0.0,0.0,39.604979,0.0,0.0,0.0
4836103,27,15200,13,1.0,299.0,69,6.5,1.0,0.021739,68.322742,...,0.0,0.0,0.0,39.612071,0.0,0.0,39.604979,0.0,0.0,0.0
4836104,27,15279,13,2.0,799.0,63,382.130435,48.0,1.043478,68.322742,...,0.0,0.0,0.0,39.612071,0.0,0.0,39.604979,0.0,0.0,0.0
4836105,27,15202,13,1.0,299.0,69,23.847826,3.0,0.065217,68.322742,...,0.0,5.0,0.108696,67.775807,3515.0,0.430338,91.566044,498.0,0.23535,0.0
4836106,27,14888,13,1.0,549.0,55,165.652174,19.0,0.413043,68.322742,...,5.0,0.0,0.0,39.612071,0.0,0.0,39.604979,0.0,0.0,0.0
