In [1]:
__author__ = "konwar.m"
__copyright__ = "Copyright 2022, AI R&D"
__credits__ = ["konwar.m"]
__license__ = "Individual Ownership"
__version__ = "1.0.1"
__maintainer__ = "konwar.m"
__email__ = "rickykonwar@gmail.com"
__status__ = "Development"

### Importing Libraries

In [2]:
import os
import copy
import pickle
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from datetime import timedelta
from sklearn import preprocessing

In [3]:
os.chdir('..')
os.getcwd()

'c:\\Users\\manash.jyoti.konwar\\Documents\\AI_Random_Projects\\ML-Retail-Sales'

### Loading Data

In [4]:
sales_train = pd.read_csv(r'datasets\sales_train.csv')
items = pd.read_csv(r'datasets\translated_items.csv')
shops = pd.read_csv(r'datasets\translated_shops.csv')
item_categories = pd.read_csv(r'datasets\translated_item_categories.csv')
test = pd.read_csv(r'datasets\test.csv')
sample_submission = pd.read_csv(r'datasets\sample_submission.csv')

### Aggregation of data

#### Getting week start dates

In [5]:
# Getting weekly dates instead of daily dates
train_data = sales_train.copy()
train_data['date'] = pd.to_datetime(train_data['date'], infer_datetime_format=True, format='%d.%m.%Y')
train_data['week_start_date'] = train_data.date.apply(lambda x: x - timedelta(days=x.weekday()))
train_data

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,week_start_date
0,2013-01-02,0,59,22154,999.00,1.0,2012-12-31
1,2013-01-03,0,25,2552,899.00,1.0,2012-12-31
2,2013-01-05,0,25,2552,899.00,-1.0,2012-12-31
3,2013-01-06,0,25,2554,1709.05,1.0,2012-12-31
4,2013-01-15,0,25,2555,1099.00,1.0,2013-01-14
...,...,...,...,...,...,...,...
2935844,2015-10-10,33,25,7409,299.00,1.0,2015-10-05
2935845,2015-10-09,33,25,7460,299.00,1.0,2015-10-05
2935846,2015-10-14,33,25,7459,349.00,1.0,2015-10-12
2935847,2015-10-22,33,25,7440,299.00,1.0,2015-10-19


In [6]:
test_data = test[['shop_id', 'item_id']].copy()
test_data['week_start_date'] = train_data.week_start_date.max() + timedelta(days=7)
test_data

Unnamed: 0,shop_id,item_id,week_start_date
0,5,5037,2015-11-02
1,5,5320,2015-11-02
2,5,5233,2015-11-02
3,5,5232,2015-11-02
4,5,5268,2015-11-02
...,...,...,...
214195,45,18454,2015-11-02
214196,45,16188,2015-11-02
214197,45,15757,2015-11-02
214198,45,19648,2015-11-02


#### Removing negative counter and aggregating at week level

In [7]:
# Removing all negative counters to 0
train_data['item_cnt_day'] = train_data['item_cnt_day'].apply(lambda x : x if x > 0 else 0)
# Aggregating at week level
train_data = train_data.groupby(['week_start_date', 'date_block_num', 'shop_id','item_id']).agg({'item_price':'mean', 'item_cnt_day':'sum'}).reset_index()
train_data

Unnamed: 0,week_start_date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2012-12-31,0,0,32,221.0,2.0
1,2012-12-31,0,0,33,347.0,1.0
2,2012-12-31,0,0,95,193.0,1.0
3,2012-12-31,0,0,98,268.0,5.0
4,2012-12-31,0,0,210,118.0,1.0
...,...,...,...,...,...,...
2360063,2015-10-26,33,59,20949,5.0,4.0
2360064,2015-10-26,33,59,21369,169.0,1.0
2360065,2015-10-26,33,59,21449,449.0,2.0
2360066,2015-10-26,33,59,21811,199.0,1.0


#### Extracting item category ids

In [8]:
# Merging category ids
train_data = pd.merge(train_data, items[['item_id', 'item_category_id']], how='left', on='item_id')
train_data

Unnamed: 0,week_start_date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,2012-12-31,0,0,32,221.0,2.0,40
1,2012-12-31,0,0,33,347.0,1.0,37
2,2012-12-31,0,0,95,193.0,1.0,40
3,2012-12-31,0,0,98,268.0,5.0,40
4,2012-12-31,0,0,210,118.0,1.0,45
...,...,...,...,...,...,...,...
2360063,2015-10-26,33,59,20949,5.0,4.0,71
2360064,2015-10-26,33,59,21369,169.0,1.0,40
2360065,2015-10-26,33,59,21449,449.0,2.0,40
2360066,2015-10-26,33,59,21811,199.0,1.0,37


In [9]:
test_data = pd.merge(test_data, items[['item_id', 'item_category_id']], how='left', on='item_id')
test_data

Unnamed: 0,shop_id,item_id,week_start_date,item_category_id
0,5,5037,2015-11-02,19
1,5,5320,2015-11-02,55
2,5,5233,2015-11-02,19
3,5,5232,2015-11-02,23
4,5,5268,2015-11-02,20
...,...,...,...,...
214195,45,18454,2015-11-02,55
214196,45,16188,2015-11-02,64
214197,45,15757,2015-11-02,55
214198,45,19648,2015-11-02,40


#### Adding other required columns and item counts

In [10]:
# Adding month block num or linear trend no
test_data['date_block_num'] = 34

# Adding item counts
test_data['item_cnt_day'] = -1

# Adding latest combination price of shop_id and item_id
latest_prices = train_data.sort_values('week_start_date').groupby(['shop_id', 'item_id']).tail(1).reset_index(drop=True)
test_data = pd.merge(test_data, latest_prices[['shop_id','item_id','item_price']], how='left', on=['shop_id','item_id'])

# Replacing nan item with latest price of each item_id
latest_item_prices = train_data.sort_values('week_start_date').groupby('item_id').tail(1).reset_index(drop=True)
latest_category_prices = train_data.sort_values('week_start_date').groupby('item_category_id').tail(1).reset_index(drop=True)
for row_index, row_data in tqdm(test_data.copy().iterrows(), total=test_data.shape[0]):
    if pd.isna(row_data.item_price):
        item_data = latest_item_prices.loc[latest_item_prices.item_id.isin([row_data.item_id])].reset_index(drop=True)
        if len(item_data)>0:
            test_data.iloc[row_index, test_data.columns.get_loc('item_price')] = item_data.item_price[0]
        else:
            item_cat_data = latest_category_prices.loc[latest_category_prices.item_category_id.isin([row_data.item_category_id])].reset_index(drop=True)
            if len(item_cat_data)>0:
                test_data.iloc[row_index, test_data.columns.get_loc('item_price')] = item_cat_data.item_price[0]

100%|██████████| 214200/214200 [03:09<00:00, 1132.60it/s]


In [11]:
test_data

Unnamed: 0,shop_id,item_id,week_start_date,item_category_id,date_block_num,item_cnt_day,item_price
0,5,5037,2015-11-02,19,34,-1,749.0
1,5,5320,2015-11-02,55,34,-1,299.0
2,5,5233,2015-11-02,19,34,-1,1199.0
3,5,5232,2015-11-02,23,34,-1,599.0
4,5,5268,2015-11-02,20,34,-1,299.0
...,...,...,...,...,...,...,...
214195,45,18454,2015-11-02,55,34,-1,99.0
214196,45,16188,2015-11-02,64,34,-1,1359.0
214197,45,15757,2015-11-02,55,34,-1,199.0
214198,45,19648,2015-11-02,40,34,-1,99.0


#### Arranging column order

In [12]:
designed_order = ['week_start_date', 'date_block_num', 'item_category_id', 'item_id', 'shop_id', 'item_price', 'item_cnt_day']
train_data = train_data[designed_order]
test_data = test_data[designed_order]

### Data Preprocessing

In [13]:
train_test_set = pd.concat([train_data, test_data], axis = 0)

# Use recent data
start_month_index = train_data.date_block_num.min()
end_month_index = train_data.date_block_num.max()

In [14]:
# Using Label Encoder to encode the item categories and use them with training set data
lb = preprocessing.LabelEncoder()
l_cat = list(item_categories.translated_item_category_name)

# Creation of Parent Category
item_categories['parent_category_name'] = item_categories['translated_item_category_name'].apply(lambda x: x.split('-')[0].strip().title())

item_categories['item_category_id_fix'] = lb.fit_transform(l_cat)
item_categories['item_category_name_fix'] = l_cat
train_test_set = train_test_set.merge(item_categories[['item_category_id', 'item_category_id_fix', 'parent_category_name']], on = 'item_category_id', how = 'left')
_ = train_test_set.drop(['item_category_id'], axis=1, inplace=True)
train_test_set.rename(columns = {'item_category_id_fix':'item_category_id'}, inplace = True)

_ = item_categories.drop(['item_category_id'],axis=1, inplace=True)
_ = item_categories.drop(['item_category_name'],axis=1, inplace=True)
_ = item_categories.drop(['translated_item_category_name'],axis=1, inplace=True)

item_categories.rename(columns = {'item_category_id_fix':'item_category_id'}, inplace = True)
item_categories.rename(columns = {'item_category_name_fix':'item_category_name'}, inplace = True)
item_categories = item_categories.drop_duplicates()
item_categories.index = np.arange(0, len(item_categories))
item_categories = item_categories.sort_values(by=['item_category_id']).reset_index(drop=True)

item_categories

Unnamed: 0,parent_category_name,item_category_id,item_category_name
0,Accessories,0,Accessories - PS2.
1,Accessories,1,Accessories - PS3.
2,Accessories,2,Accessories - PS4.
3,Accessories,3,Accessories - PSP.
4,Accessories,4,Accessories - Xbox 360
...,...,...,...
79,Programs,79,Programs - training
80,Service,80,Service
81,Service,81,Service - Tickets
82,Tickets (Digit),82,Tickets (digit)


In [15]:
item_categories.parent_category_name.unique()

array(['Accessories', 'Android Games', 'Books', 'Cinema',
       'Clean Media (Piece)', 'Clean Media (Spire)', 'Delivery Of Goods',
       'Film', 'Games', 'Gaming Consoles', 'Gifts', 'Mac Games', 'Music',
       'Pc', 'Pc Games', 'Payment Cards (Cinema, Music, Games)',
       'Payment Cards', 'Power Elements', 'Programs', 'Service',
       'Tickets (Digit)', 'Игры'], dtype=object)

In [16]:
# Parent Category mapping
parent_cat_counter = 0
parent_category_map = {}
for parent_cat in item_categories.parent_category_name.unique():
    if parent_cat not in parent_category_map:
        parent_category_map[parent_cat] = parent_cat_counter
        parent_cat_counter+=1
parent_category_map

{'Accessories': 0,
 'Android Games': 1,
 'Books': 2,
 'Cinema': 3,
 'Clean Media (Piece)': 4,
 'Clean Media (Spire)': 5,
 'Delivery Of Goods': 6,
 'Film': 7,
 'Games': 8,
 'Gaming Consoles': 9,
 'Gifts': 10,
 'Mac Games': 11,
 'Music': 12,
 'Pc': 13,
 'Pc Games': 14,
 'Payment Cards (Cinema, Music, Games)': 15,
 'Payment Cards': 16,
 'Power Elements': 17,
 'Programs': 18,
 'Service': 19,
 'Tickets (Digit)': 20,
 'Игры': 21}

In [17]:
# Removing categories with very less data (data points less than 10)
category_2_remove = []
for category in train_data.item_category_id.unique():
    data_shape = train_data.loc[train_data.item_category_id.isin([category])].shape
    if data_shape[0]<10:
        category_2_remove.append(category)

print(category_2_remove)

train_test_set = train_test_set.loc[~train_test_set.item_category_id.isin(category_2_remove)].sort_values(by=['week_start_date','item_category_id']).reset_index(drop=True)
train_test_set

[0, 1, 48, 10, 52, 53, 51, 27, 50]


Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories
...,...,...,...,...,...,...,...,...
2403504,2015-11-02,34,1866,45,1249.5,-1.0,83,Игры
2403505,2015-11-02,34,4842,45,4699.0,-1.0,83,Игры
2403506,2015-11-02,34,4356,45,2199.0,-1.0,83,Игры
2403507,2015-11-02,34,2839,45,2899.0,-1.0,83,Игры


In [18]:
# Clipping data within 0 and 20
train_test_set['item_cnt_day'] = train_test_set['item_cnt_day'].clip(0,20)
train_test_set.head()

Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories


### Feature Engineering

In [19]:
enable_shop_features, enable_price_lag_features, enable_parent_category_price_ratios, enable_seasonality_features, enable_item_category_price_ratios = True, True, True, True, True

#### Shop related features  
1. Extracting total no of items sold on each shop for each category  
2. Getting mean price of each category being sold in each shop 

In [20]:
# Extract Shop Level mean price for each category
shop_data = train_test_set.groupby(['shop_id','item_category_id','week_start_date']).agg({'item_cnt_day':'sum', 'item_price':'mean'}).reset_index()
shop_data.rename(columns={'item_cnt_day':'count_item_week_shop','item_price':'price_mean_week_category'}, inplace=True)
shop_data

Unnamed: 0,shop_id,item_category_id,week_start_date,count_item_week_shop,price_mean_week_category
0,0,2,2012-12-31,3.0,242.000000
1,0,2,2013-01-07,10.0,242.000000
2,0,2,2013-01-14,6.0,242.000000
3,0,2,2013-01-21,6.0,242.000000
4,0,2,2013-01-28,4.0,242.000000
...,...,...,...,...,...
197826,59,83,2015-09-28,4.0,3999.000000
197827,59,83,2015-10-12,1.0,3999.000000
197828,59,83,2015-10-19,6.0,5449.000000
197829,59,83,2015-10-26,5.0,2881.250000


In [21]:
if enable_shop_features:
    train_test_set = pd.merge(train_test_set, shop_data, how='left', on=['shop_id','item_category_id','week_start_date'])
train_test_set

Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name,count_item_week_shop,price_mean_week_category
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories,3.0,242.000000
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories,4.0,242.000000
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories,5.0,499.000000
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories,1.0,499.000000
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories,4.0,499.000000
...,...,...,...,...,...,...,...,...,...,...
2403504,2015-11-02,34,1866,45,1249.5,0.0,83,Игры,0.0,2643.343333
2403505,2015-11-02,34,4842,45,4699.0,0.0,83,Игры,0.0,2643.343333
2403506,2015-11-02,34,4356,45,2199.0,0.0,83,Игры,0.0,2643.343333
2403507,2015-11-02,34,2839,45,2899.0,0.0,83,Игры,0.0,2643.343333


#### Price Lag Features  
1. Add 1, 4, 12, 24 week lag prices for each item  
2. Add 1, 4, 12, 24 week lag prices for each item sold from each shop

In [22]:
lag_period = [1,4,12,24]

In [23]:
df_item_price_lagged =  pd.DataFrame()

if enable_price_lag_features:
    # Generate lag prices for each item
    item_concat_data = train_test_set[['week_start_date', 'item_id', 'item_price']].copy()
    item_concat_data = item_concat_data.groupby(['week_start_date', 'item_id']).agg({'item_price':'mean'}).reset_index()
    item_concat_data.drop_duplicates(inplace=True)

    # Set index
    item_concat_data = item_concat_data.set_index(['week_start_date', 'item_id'])

    for lag_no in tqdm(lag_period, desc='Generating item price lags'):
        intermediate_item_concat_data = item_concat_data.copy()
        intermediate_item_concat_data = intermediate_item_concat_data.unstack().shift(lag_no)
        intermediate_item_concat_data = intermediate_item_concat_data.stack(dropna=False).reset_index()
        intermediate_item_concat_data.rename(columns={'item_price':'price_lag_item_'+str(lag_no)}, inplace=True)

        df_item_price_lagged = intermediate_item_concat_data.copy() if len(df_item_price_lagged)==0 else \
                            pd.merge(df_item_price_lagged, intermediate_item_concat_data[['week_start_date', 'item_id', 'price_lag_item_'+str(lag_no)]], how='left', on=['week_start_date', 'item_id'])

    # Merging with train test data
    train_test_set = pd.merge(train_test_set, df_item_price_lagged, how='left', on=['week_start_date','item_id']).reset_index(drop=True)

    # Replacing nan values with 0
    for lag_no in lag_period:
        train_test_set['price_lag_item_'+str(lag_no)] = train_test_set['price_lag_item_'+str(lag_no)].fillna(0)

train_test_set

Generating item price lags: 100%|██████████| 4/4 [00:08<00:00,  2.17s/it]


Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name,count_item_week_shop,price_mean_week_category,price_lag_item_1,price_lag_item_4,price_lag_item_12,price_lag_item_24
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories,3.0,242.000000,0.0,0.0,0.0,0.000000
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories,4.0,242.000000,0.0,0.0,0.0,0.000000
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories,5.0,499.000000,0.0,0.0,0.0,0.000000
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories,1.0,499.000000,0.0,0.0,0.0,0.000000
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories,4.0,499.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403504,2015-11-02,34,1866,45,1249.5,0.0,83,Игры,0.0,2643.343333,0.0,2499.0,0.0,1499.071429
2403505,2015-11-02,34,4842,45,4699.0,0.0,83,Игры,0.0,2643.343333,0.0,4699.0,0.0,0.000000
2403506,2015-11-02,34,4356,45,2199.0,0.0,83,Игры,0.0,2643.343333,0.0,1198.0,1198.0,1498.000000
2403507,2015-11-02,34,2839,45,2899.0,0.0,83,Игры,0.0,2643.343333,0.0,0.0,0.0,0.000000


In [24]:
df_item_shop_price_lagged =  pd.DataFrame()

if enable_price_lag_features:
    # Generate lag prices for each item
    item_shop_concat_data = train_test_set[['week_start_date', 'item_id', 'shop_id', 'item_price']].copy()
    item_shop_concat_data = item_shop_concat_data.groupby(['week_start_date', 'item_id', 'shop_id']).agg({'item_price':'mean'}).reset_index()
    item_shop_concat_data.drop_duplicates(inplace=True)

    # Set index
    item_shop_concat_data = item_shop_concat_data.set_index(['week_start_date', 'item_id', 'shop_id'])

    for lag_no in tqdm(lag_period, desc='Generating item-shop price lags'):
        intermediate_item_shop_concat_data = item_shop_concat_data.copy()
        intermediate_item_shop_concat_data = intermediate_item_shop_concat_data.unstack().shift(lag_no)
        intermediate_item_shop_concat_data = intermediate_item_shop_concat_data.stack(dropna=False).reset_index()
        intermediate_item_shop_concat_data.rename(columns={'item_price':'price_lag_item_shop_'+str(lag_no)}, inplace=True)

        df_item_shop_price_lagged = intermediate_item_shop_concat_data.copy() if len(df_item_shop_price_lagged)==0 else \
                            pd.merge(df_item_shop_price_lagged, intermediate_item_shop_concat_data[['week_start_date', 'shop_id', 'item_id', 'price_lag_item_shop_'+str(lag_no)]], how='left', on=['week_start_date', 'item_id', 'shop_id'])

    # Merging with train test data
    train_test_set = pd.merge(train_test_set, df_item_shop_price_lagged, how='left', on=['week_start_date', 'shop_id', 'item_id']).reset_index(drop=True)

    # Replacing nan values with 0
    for lag_no in lag_period:
        train_test_set['price_lag_item_shop_'+str(lag_no)] = train_test_set['price_lag_item_shop_'+str(lag_no)].fillna(0)

train_test_set

Generating item-shop price lags: 100%|██████████| 4/4 [01:57<00:00, 29.25s/it]


Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name,count_item_week_shop,price_mean_week_category,price_lag_item_1,price_lag_item_4,price_lag_item_12,price_lag_item_24,price_lag_item_shop_1,price_lag_item_shop_4,price_lag_item_shop_12,price_lag_item_shop_24
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories,3.0,242.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories,4.0,242.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories,5.0,499.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories,1.0,499.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories,4.0,499.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403504,2015-11-02,34,1866,45,1249.5,0.0,83,Игры,0.0,2643.343333,0.0,2499.0,0.0,1499.071429,1249.0,1499.0,5999.0,298.00
2403505,2015-11-02,34,4842,45,4699.0,0.0,83,Игры,0.0,2643.343333,0.0,4699.0,0.0,0.000000,3999.0,1399.0,99.0,1748.01
2403506,2015-11-02,34,4356,45,2199.0,0.0,83,Игры,0.0,2643.343333,0.0,1198.0,1198.0,1498.000000,1798.0,1399.0,1999.0,1299.00
2403507,2015-11-02,34,2839,45,2899.0,0.0,83,Игры,0.0,2643.343333,0.0,0.0,0.0,0.000000,2899.0,1598.0,5499.0,140.00


#### Generating Seasonality Index at Item Category Level

In [25]:
df_seasonality_index, df_week = pd.DataFrame(), pd.DataFrame()
if enable_seasonality_features:
    start_date = train_test_set.week_start_date.min()
    end_date = train_test_set.week_start_date.max()

    delta = end_date - start_date   # returns timedelta
    week_list = [start_date + timedelta(days=7*i) for i in range((delta.days//7) + 1)]

    # Extract Week level Characteristics
    df_week = pd.DataFrame([week_day for week_day in week_list], columns=['week_start_date']).reset_index()
    df_week = df_week.rename(columns={'index':'week_block_num'})
    df_week['week_block_num'] += 1 
    df_week = df_week[['week_start_date','week_block_num']]
    df_week['month'] = df_week['week_start_date'].apply(lambda x: x.month)
    df_week['year'] = df_week['week_start_date'].apply(lambda x: x.year)
    
    # Calculating seasonality for 3 years (from 01-07-2013 to 30-06-2015)
    df_week_seasonality = df_week.copy()
    df_week_seasonality = df_week_seasonality.loc[(df_week_seasonality.week_start_date>=pd.to_datetime('2013-07-01')) & (df_week_seasonality.week_start_date<=pd.to_datetime('2015-06-30'))]
    
    df_seasonality_index = pd.DataFrame(columns=['month','parent_category_name','item_cnt_day','seasonal_index'])

    df_demand = train_test_set.loc[~train_test_set.date_block_num.isin([34])][['week_start_date', 'item_cnt_day', 'parent_category_name']].reset_index(drop=True)
    df_demand = df_demand.groupby(['week_start_date','parent_category_name']).agg({'item_cnt_day':'sum'}).reset_index()
    for parent_cat in sorted(list(df_demand.parent_category_name.unique())):
        try:
            df_intermediate_demand = df_demand.loc[df_demand.parent_category_name.isin([parent_cat])].reset_index(drop=True)
            df_intermediate_demand = pd.merge(df_week_seasonality.copy(), df_intermediate_demand[['week_start_date','item_cnt_day']], how='left', on='week_start_date')
            df_intermediate_demand['item_cnt_day'] = df_intermediate_demand['item_cnt_day'].fillna(0)
            df_intermediate_demand = df_intermediate_demand.groupby('month').agg({'item_cnt_day':'mean'}).reset_index()
            df_intermediate_demand['seasonal_index'] = df_intermediate_demand['item_cnt_day']/df_intermediate_demand['item_cnt_day'].mean()
            df_intermediate_demand['parent_category_name'] = parent_cat
            df_intermediate_demand = df_intermediate_demand[['month','parent_category_name','item_cnt_day','seasonal_index']] 
        except Exception:
            continue
        finally:
            df_seasonality_index = pd.concat([df_seasonality_index, df_intermediate_demand], ignore_index=True)

df_seasonality_index

Unnamed: 0,month,parent_category_name,item_cnt_day,seasonal_index
0,1,Accessories,533.25,1.107703
1,2,Accessories,562.5,1.168464
2,3,Accessories,389.3,0.808681
3,4,Accessories,320.875,0.666544
4,5,Accessories,305.25,0.634086
...,...,...,...,...
247,8,Игры,0.0,0.0
248,9,Игры,56.1,0.462832
249,10,Игры,71.75,0.591946
250,11,Игры,127.875,1.054984


In [26]:
if enable_seasonality_features:
    train_test_set = pd.merge(train_test_set, df_week[['week_start_date','week_block_num', 'month']], how='left', on=['week_start_date'])
    train_test_set = pd.merge(train_test_set, df_seasonality_index[['month', 'parent_category_name', 'seasonal_index']], how='left', on=['month','parent_category_name'])
train_test_set

Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name,count_item_week_shop,price_mean_week_category,...,price_lag_item_4,price_lag_item_12,price_lag_item_24,price_lag_item_shop_1,price_lag_item_shop_4,price_lag_item_shop_12,price_lag_item_shop_24,week_block_num,month,seasonal_index
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories,3.0,242.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories,4.0,242.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories,5.0,499.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories,1.0,499.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories,4.0,499.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403504,2015-11-02,34,1866,45,1249.5,0.0,83,Игры,0.0,2643.343333,...,2499.0,0.0,1499.071429,1249.0,1499.0,5999.0,298.00,149,11,1.054984
2403505,2015-11-02,34,4842,45,4699.0,0.0,83,Игры,0.0,2643.343333,...,4699.0,0.0,0.000000,3999.0,1399.0,99.0,1748.01,149,11,1.054984
2403506,2015-11-02,34,4356,45,2199.0,0.0,83,Игры,0.0,2643.343333,...,1198.0,1198.0,1498.000000,1798.0,1399.0,1999.0,1299.00,149,11,1.054984
2403507,2015-11-02,34,2839,45,2899.0,0.0,83,Игры,0.0,2643.343333,...,0.0,0.0,0.000000,2899.0,1598.0,5499.0,140.00,149,11,1.054984


#### Generating Parent Category Price Ratios

In [27]:
df_parent_price_set = pd.DataFrame()
if enable_parent_category_price_ratios:
    df_parent_price_set = train_test_set.groupby(['week_start_date', 'parent_category_name']).agg({'item_price':'mean'}).reset_index()
    df_parent_price_set = df_parent_price_set.rename(columns={'item_price': 'mean_parent_category_price'})
df_parent_price_set

Unnamed: 0,week_start_date,parent_category_name,mean_parent_category_price
0,2012-12-31,Accessories,1189.343670
1,2012-12-31,Books,312.032164
2,2012-12-31,Cinema,321.476990
3,2012-12-31,Clean Media (Piece),33.125000
4,2012-12-31,Clean Media (Spire),213.750000
...,...,...,...
2293,2015-11-02,Pc Games,578.935831
2294,2015-11-02,Power Elements,145.830000
2295,2015-11-02,Programs,1623.641541
2296,2015-11-02,Service,1291.016832


In [28]:
if enable_parent_category_price_ratios:
    train_test_set = pd.merge(train_test_set, df_parent_price_set, how='left', on=['week_start_date', 'parent_category_name'])
    train_test_set['priceratio_parent_category'] = np.where((train_test_set['item_price']>0) & (train_test_set['mean_parent_category_price']>0), train_test_set['item_price']/train_test_set['mean_parent_category_price'], np.nan)
    train_test_set.drop(columns=['mean_parent_category_price'], inplace=True)
train_test_set

Unnamed: 0,week_start_date,date_block_num,item_id,shop_id,item_price,item_cnt_day,item_category_id,parent_category_name,count_item_week_shop,price_mean_week_category,...,price_lag_item_12,price_lag_item_24,price_lag_item_shop_1,price_lag_item_shop_4,price_lag_item_shop_12,price_lag_item_shop_24,week_block_num,month,seasonal_index,priceratio_parent_category
0,2012-12-31,0,13071,0,242.0,3.0,2,Accessories,3.0,242.000000,...,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872,0.203474
1,2012-12-31,0,13071,1,242.0,4.0,2,Accessories,4.0,242.000000,...,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872,0.203474
2,2012-12-31,0,13071,2,499.0,5.0,2,Accessories,5.0,499.000000,...,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872,0.419559
3,2012-12-31,0,13071,4,499.0,1.0,2,Accessories,1.0,499.000000,...,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872,0.419559
4,2012-12-31,0,13071,6,499.0,4.0,2,Accessories,4.0,499.000000,...,0.0,0.000000,0.0,0.0,0.0,0.00,1,12,2.512872,0.419559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403504,2015-11-02,34,1866,45,1249.5,0.0,83,Игры,0.0,2643.343333,...,0.0,1499.071429,1249.0,1499.0,5999.0,298.00,149,11,1.054984,0.470724
2403505,2015-11-02,34,4842,45,4699.0,0.0,83,Игры,0.0,2643.343333,...,0.0,0.000000,3999.0,1399.0,99.0,1748.01,149,11,1.054984,1.770255
2403506,2015-11-02,34,4356,45,2199.0,0.0,83,Игры,0.0,2643.343333,...,1198.0,1498.000000,1798.0,1399.0,1999.0,1299.00,149,11,1.054984,0.828430
2403507,2015-11-02,34,2839,45,2899.0,0.0,83,Игры,0.0,2643.343333,...,0.0,0.000000,2899.0,1598.0,5499.0,140.00,149,11,1.054984,1.092141


#### Generating Item Category Price Ratios / Segregating training data based on Parent Categories

In [29]:
segregated_train_dict = {}
if enable_item_category_price_ratios:
    for parent_cat in train_test_set.parent_category_name.unique():
        try:
            print(parent_cat)

            # Extracting data with respect to item category
            df_main_item_cat_price_set = train_test_set.loc[train_test_set.parent_category_name.isin([parent_cat])].reset_index(drop=True)
            df_intermediate_item_cat_price_set = df_main_item_cat_price_set.copy()
            df_intermediate_item_cat_price_set = df_intermediate_item_cat_price_set.groupby(['week_start_date', 'item_category_id']).agg({'item_price':'mean'}).reset_index()
            df_intermediate_item_cat_price_set = df_intermediate_item_cat_price_set.rename(columns={'item_price': 'mean_item_category_price'})
            
            item_cat_id_list = sorted(list(df_intermediate_item_cat_price_set.item_category_id.unique()))

            # Initializing dataframe at parent cat level with all unique weeks
            week_list = sorted(list(df_intermediate_item_cat_price_set.week_start_date.unique()))
            df_item_cat_price_set = pd.DataFrame([[week_date] for week_date in week_list], columns=['week_start_date'])

            # Fill up the week level dates with item category prices
            for item_cat_id in item_cat_id_list:
                df_item_cat = df_intermediate_item_cat_price_set.loc[df_intermediate_item_cat_price_set.item_category_id.isin([item_cat_id])].reset_index(drop=True)
                df_item_cat_price_set = pd.merge(df_item_cat_price_set, df_item_cat[['week_start_date','mean_item_category_price']], how='left', on='week_start_date').rename(columns={'mean_item_category_price':'item_category_price_'+str(item_cat_id)})

            df_main_item_cat_price_set = pd.merge(df_main_item_cat_price_set, df_item_cat_price_set, how='left', on='week_start_date')

            # Fill up price ratios with competitor item category ids
            for item_cat_id in item_cat_id_list:
                df_main_item_cat_price_set['priceratio_item_category_'+str(item_cat_id)] = np.where((df_main_item_cat_price_set['item_price']>0) & (df_main_item_cat_price_set['item_category_price_'+str(item_cat_id)]>0), \
                                                                                                    df_main_item_cat_price_set['item_price']/df_main_item_cat_price_set['item_category_price_'+str(item_cat_id)], np.nan)

            # Drop all item category price based columns
            df_main_item_cat_price_set = df_main_item_cat_price_set.drop(['item_category_price_'+str(item_cat_id) for item_cat_id in item_cat_id_list], axis=1)               
            
            print(df_main_item_cat_price_set.shape)
        except Exception as ex:
            print('Caught Exception while generating train test set for parent category: %s' %(parent_cat))
            continue
        finally:
            segregated_train_dict[parent_cat] = df_main_item_cat_price_set


Accessories
(47427, 27)
Books
(69788, 34)
Cinema
(573284, 26)
Clean Media (Piece)
(2387, 23)
Clean Media (Spire)
(621, 23)
Film
(184489, 23)
Games
(411630, 29)
Gaming Consoles
(20081, 30)
Gifts
(188850, 29)
Music
(418428, 28)
Pc Games
(376393, 26)
Payment Cards
(21537, 26)
Power Elements
(5380, 23)
Programs
(57596, 28)
Service
(4471, 24)
Pc
(45, 23)
Payment Cards (Cinema, Music, Games)
(3708, 23)
Tickets (Digit)
(765, 23)
Игры
(16442, 23)
Mac Games
(50, 23)
Android Games
(137, 23)


### Train-Test Data Saving for different Parent Categories

#### Filtering out train-test set 
1. no of data points is very less  
2. excluding date_block_num 34 since those are testing period

In [30]:
if not os.path.exists(os.path.join('datasets', 'train_test_datasets_v02')):
    os.makedirs(os.path.join('datasets', 'train_test_datasets_v02'))

In [31]:
finalized_train_dict, finalized_test_dict = {}, {}

for parent_cat in tqdm(segregated_train_dict.keys(), desc='Filtering relevant parent category with sufficient data for training'):
    parent_data = segregated_train_dict.get(parent_cat)
    week_mask = parent_data.date_block_num.isin([34])
    parent_train_data = parent_data.loc[~week_mask].reset_index(drop=True)
    parent_test_data = parent_data.loc[week_mask].reset_index(drop=True)
    
    if parent_data.shape[0]>10:
        print('Parent Category::: %s' %(parent_cat))
        if parent_cat not in finalized_train_dict.keys():
            print('Size of train data::: %s' %(str(parent_train_data.shape)))
            finalized_train_dict[parent_cat] = parent_train_data

        if parent_cat not in finalized_test_dict.keys():
            print('Size of test data::: %s' %(str(parent_test_data.shape)))
            finalized_test_dict[parent_cat] = parent_test_data

Filtering relevant parent category with sufficient data for training:   0%|          | 0/21 [00:00<?, ?it/s]

Parent Category::: Accessories
Size of train data::: (44697, 27)
Size of test data::: (2730, 27)
Parent Category::: Books
Size of train data::: (54542, 34)
Size of test data::: (15246, 34)


Filtering relevant parent category with sufficient data for training:  14%|█▍        | 3/21 [00:00<00:01, 14.41it/s]

Parent Category::: Cinema
Size of train data::: (534140, 26)
Size of test data::: (39144, 26)
Parent Category::: Clean Media (Piece)
Size of train data::: (2387, 23)
Size of test data::: (0, 23)
Parent Category::: Clean Media (Spire)
Size of train data::: (621, 23)
Size of test data::: (0, 23)
Parent Category::: Film
Size of train data::: (170587, 23)
Size of test data::: (13902, 23)


Filtering relevant parent category with sufficient data for training:  48%|████▊     | 10/21 [00:00<00:00, 17.97it/s]

Parent Category::: Games
Size of train data::: (390798, 29)
Size of test data::: (20832, 29)
Parent Category::: Gaming Consoles
Size of train data::: (18905, 30)
Size of test data::: (1176, 30)
Parent Category::: Gifts
Size of train data::: (161214, 29)
Size of test data::: (27636, 29)
Parent Category::: Music
Size of train data::: (374664, 28)
Size of test data::: (43764, 28)


Filtering relevant parent category with sufficient data for training: 100%|██████████| 21/21 [00:00<00:00, 29.58it/s]

Parent Category::: Pc Games
Size of train data::: (356065, 26)
Size of test data::: (20328, 26)
Parent Category::: Payment Cards
Size of train data::: (20991, 26)
Size of test data::: (546, 26)
Parent Category::: Power Elements
Size of train data::: (5212, 23)
Size of test data::: (168, 23)
Parent Category::: Programs
Size of train data::: (49616, 28)
Size of test data::: (7980, 28)
Parent Category::: Service
Size of train data::: (4429, 24)
Size of test data::: (42, 24)
Parent Category::: Pc
Size of train data::: (3, 23)
Size of test data::: (42, 23)
Parent Category::: Payment Cards (Cinema, Music, Games)
Size of train data::: (3708, 23)
Size of test data::: (0, 23)
Parent Category::: Tickets (Digit)
Size of train data::: (765, 23)
Size of test data::: (0, 23)
Parent Category::: Игры
Size of train data::: (12452, 23)
Size of test data::: (3990, 23)
Parent Category::: Mac Games
Size of train data::: (8, 23)
Size of test data::: (42, 23)
Parent Category::: Android Games
Size of train da




In [32]:
finalized_train_dict.keys()

dict_keys(['Accessories', 'Books', 'Cinema', 'Clean Media (Piece)', 'Clean Media (Spire)', 'Film', 'Games', 'Gaming Consoles', 'Gifts', 'Music', 'Pc Games', 'Payment Cards', 'Power Elements', 'Programs', 'Service', 'Pc', 'Payment Cards (Cinema, Music, Games)', 'Tickets (Digit)', 'Игры', 'Mac Games', 'Android Games'])

In [33]:
finalized_test_dict.keys()

dict_keys(['Accessories', 'Books', 'Cinema', 'Clean Media (Piece)', 'Clean Media (Spire)', 'Film', 'Games', 'Gaming Consoles', 'Gifts', 'Music', 'Pc Games', 'Payment Cards', 'Power Elements', 'Programs', 'Service', 'Pc', 'Payment Cards (Cinema, Music, Games)', 'Tickets (Digit)', 'Игры', 'Mac Games', 'Android Games'])

#### Saving training data and essentials

In [34]:
# Saving the train datasets at parent category level
for parent_cat in tqdm(finalized_train_dict.keys(), desc='Saving Training data for parent categories'):
    training_data = finalized_train_dict.get(parent_cat)
    training_data.to_csv(os.path.join('datasets', 'train_test_datasets_v02', 'train_'+str(parent_category_map.get(parent_cat))+'_data.csv'), index=False)

Saving Training data for parent categories: 100%|██████████| 21/21 [01:32<00:00,  4.40s/it]


In [35]:
# Saving parent category map for training pipeline
with open(os.path.join('datasets', 'train_test_datasets_v02', 'parent_cat_map.pkl'), "wb") as fp:
    pickle.dump(parent_category_map, fp)

In [36]:
# Saving seasonality data
df_seasonality_index.to_csv(os.path.join('datasets', 'train_test_datasets_v02', 'df_seasonality.csv'), index=False)

#### Saving test data 

In [37]:
for parent_cat in tqdm(finalized_test_dict.keys(), desc='Saving Testing data for parent categories'):
    testing_data = finalized_test_dict.get(parent_cat)
    testing_data.to_csv(os.path.join('datasets', 'train_test_datasets_v02', 'test_'+str(parent_category_map.get(parent_cat))+'_data.csv'), index=False)

Saving Testing data for parent categories: 100%|██████████| 21/21 [00:08<00:00,  2.34it/s]
