In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import sys
import time

from sklearn.preprocessing import LabelEncoder
import scipy.stats as ss

if os.name=='nt':
    try:
        mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
        os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
    except:
        pass
    
import xgboost as xgb
from sklearn.metrics import mean_squared_error 
from numba import jit

%matplotlib inline

### Load data

In [9]:
# Load data
train = pd.read_csv('all/sales_train.csv.gz')
test = pd.read_csv('all/test.csv.gz')
shop = pd.read_csv('all/shops-translated.csv')
item = pd.read_csv('all/item_category.csv')

test.set_index('ID', inplace=True)
item.drop(['item_name_translated'], axis=1, inplace=True)
shop.drop(['Name'], axis=1, inplace=True)

le = LabelEncoder()
item['item_cat1'] = le.fit_transform(item['item_cat1'].astype(str))
item['item_cat2'] = le.fit_transform(item['item_cat2'].astype(str))
shop['City'] = le.fit_transform(shop['City'])
shop['Type'] = le.fit_transform(shop['Type'])

shop.set_index('shop_id', inplace=True)
item.set_index('item_id', inplace=True)

### `cnt`

In [41]:
feature_dict = {} # dict containing all the features

In [42]:
# Total count of sales per month
train_p = train.pivot_table(index=['shop_id', 'item_id'],
                           columns='date_block_num',
                           values='item_cnt_day',
                           aggfunc=np.sum).fillna(0.0)
# Rename columns
train_p.columns = ['cnt_'+str(k) for k in train_p.columns.get_level_values(0)]
# Add one column for test data
train_p['cnt_34'] = np.zeros((len(train_p), 1))
# Add to the feature dict
feature_dict['cnt'] = train_p

### `sales`

In [45]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [46]:
train['sales'] = train['item_price']*train['item_cnt_day']

In [51]:
# Total value of sales per month
sales = train.pivot_table(index=['shop_id', 'item_id'],
                         columns='date_block_num', 
                         values='sales',
                         aggfunc=np.sum).fillna(0.0)
# Rename columns
sales.columns = ['sales_'+str(k) for k in sales.columns.get_level_values(0)]
# Add on column for test data
sales['sales_34'] = np.zeros((len(sales), 1))
# Add to the feature dict
feature_dict['sales'] = sales

### explore

In [57]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,sales
0,02.01.2013,0,59,22154,999.0,1.0,999.0
1,03.01.2013,0,25,2552,899.0,1.0,899.0
2,05.01.2013,0,25,2552,899.0,-1.0,-899.0
3,06.01.2013,0,25,2554,1709.05,1.0,1709.05
4,15.01.2013,0,25,2555,1099.0,1.0,1099.0


In [58]:
shop.head()

Unnamed: 0_level_0,City,Type
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26,3
1,26,4
2,0,4
3,1,5
4,23,4


In [59]:
x = train.join(shop, how='left')

In [61]:
x = x.join(item, how='left')

In [62]:
x.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,sales,City,Type,item_cat1,item_cat2
0,02.01.2013,0,59,22154,999.0,1.0,999.0,26.0,3.0,3.0,23.0
1,03.01.2013,0,25,2552,899.0,1.0,899.0,26.0,4.0,20.0,32.0
2,05.01.2013,0,25,2552,899.0,-1.0,-899.0,0.0,4.0,3.0,23.0
3,06.01.2013,0,25,2554,1709.05,1.0,1709.05,1.0,5.0,3.0,23.0
4,15.01.2013,0,25,2555,1099.0,1.0,1099.0,23.0,4.0,3.0,23.0


In [65]:
a = x.pivot_table(index=['item_id'],
                 columns='date_block_num',
                 values='item_price',
                 aggfunc=np.mean).fillna(0.0)

In [66]:
a.head(2)

date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Apply pivot table to x with combinations of the following parameters
index_list = ['item_id', 'shop_id', 'City', 'Type', 'item_cat1', 'item_cat2']
column_list = ['date_block_num']
value_list = ['item_price', 'item_cnt_day', 'sales']
aggfunc_list = [np.sum, np.mean]