In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from numba import jit

%matplotlib inline

### Load data

In [10]:
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/items.csv')
item.drop(['item_name'], axis=1, inplace=True)

### Combinations of date, shop, and item

In [3]:
from itertools import product
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []
for block_num in train['date_block_num'].unique():
    cur_shops = train.loc[train['date_block_num']==block_num, 'shop_id'].unique()
    cur_items = train.loc[train['date_block_num']==block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype=np.int32))
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

### clean up

In [4]:
train = train[train.item_price<100000]
train = train[train.item_price<1000]

### Mean encodings

In [37]:
sales_m = train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': np.sum, 
                                                                      'item_price': np.mean}).reset_index()

In [38]:
sales_m = pd.merge(grid, sales_m, on=['date_block_num', 'shop_id', 'item_id'], how='left').fillna(0.0)

KeyError: "['item_name'] not found in axis"

In [None]:
sales_m = pd.merge(sales_m, item, on=['item_id'], how='left')

In [None]:
for type_id in ['item_id', 'shop_id', 'item_category_id']:
    for column_id, aggregator, aggtype in [('item_price', np.mean, 'avg'), 
                                           ('item_cnt_day', np.sum, 'sum'), 
                                           ('item_cnt_day', np.mean, 'avg')]:
        mean_df = sales_m.groupby([type_id, 'date_block_num']).\
            aggregate(aggregator).reset_index()[[column_id, type_id, 'date_block_num']]
        mean_df.columns = [type_id+'_'+aggtype+'_'+column_id,type_id,'date_block_num']
        sales_m = pd.merge(sales_m, mean_df, on=['date_block_num', type_id], how='left')

In [36]:
sales_m.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,item_price,item_category_id,item_id_avg_item_price_x,item_id_avg_item_price_y,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day,shop_id_avg_item_cnt_day,item_category_id_avg_item_price,item_category_id_sum_item_cnt_day,item_category_id_avg_item_cnt_day
0,59,22154,0,1.0,999.0,37,310.8,310.8,18.0,0.4,39.178139,1687.0,0.207887,53.041263,5819.0,0.190724
1,59,2552,0,0.0,0.0,58,19.977778,19.977778,0.0,0.0,39.178139,1687.0,0.207887,6.195011,51.0,0.00771
2,59,2554,0,0.0,0.0,58,0.0,0.0,0.0,0.0,39.178139,1687.0,0.207887,6.195011,51.0,0.00771
3,59,2555,0,0.0,0.0,56,0.0,0.0,0.0,0.0,39.178139,1687.0,0.207887,23.386519,197.0,0.036481
4,59,2564,0,0.0,0.0,59,36.4,36.4,5.0,0.111111,39.178139,1687.0,0.207887,22.932845,637.0,0.084764
