# Generate aggregated features

# TOC

* [1 Loading the data](#1-Loading-the-data)
* [2 Cleaning of the dataset](#2-Cleaning-of-the-dataset)
* [3 Making the base set](#3-Making-the-base-set)
* [4 Generating aggregated features](#4-Generating-aggregated-features)
* [5 Aggregate EDA](#5-Aggregate-EDA)
* [6 Adding mean encoding](#6-Adding-mean-encoding)
* [7 Merging the aggregated features with the data dataframe](#7-Merging-the-aggregated-features-with-the-data-dataframe)
* [8 Adding temporal history](#8-Adding-temporal-history)
* [9 Storing the data frame](#9-Storing-the-data-frame)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
import gc
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import product
from sklearn import model_selection

# 1 Loading the data

In [None]:
# Folder with the input files, resolved against the current working directory
data_dir = Path('.').absolute() / 'data'

# Read every input table into its own data frame
sales_train = pd.read_csv(data_dir / 'sales_train.csv.gz')
sales_test = pd.read_csv(data_dir / 'test.csv.gz')
items = pd.read_csv(data_dir / 'items.csv')
item_categories = pd.read_csv(data_dir / 'item_categories.csv')
shops = pd.read_csv(data_dir / 'shops.csv')

In [None]:
n_train_samples = sales_train.shape[0]

Cast the dates to actual dates for easier manipulation

In [None]:
# Parse the 'dd.mm.yyyy' strings into datetimes.
# NOTE: The column is replaced by direct assignment; `df.loc[:, col] = ...` writes the
#       values into the existing column and can leave it with its old object dtype.
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

# 2 Cleaning of the dataset

After investigating the item-count-per-day outliers, we saw that these may actually be correct (and not arising from typos etc.)

Further, we saw that the outlier in price could be fixed by converting `'Radmin 3  - 522 лиц.'` to `'Radmin 3  - 1 лиц.'`.

We do the conversion in the following cells

In [None]:
# Values obtained from EDA
item_count_522 = 522
item_id_1 = 6065

In [None]:
max_price = sales_train.loc[:, 'item_price'].max()
high_price = sales_train.loc[sales_train.loc[:, 'item_price'] == max_price]

In [None]:
# Row label of the first sale that carries the maximum price
index = high_price.index[0]

# Re-book that sale as `item_count_522` units of item `item_id_1`,
# priced at the per-unit share of the original price
sales_train.loc[index, 'item_id'] = item_id_1
sales_train.loc[index, 'item_cnt_day'] = item_count_522
sales_train.loc[index, 'item_price'] = max_price/item_count_522

Further, we saw that the data point for plastic bags had a high item count. As this is used to calculate the revenue below, we will not alter this item.

We've seen that the `item_cnt_day` values are floats. To speed up calculations, we transform them to integers.

In [None]:
# Store the daily counts as 32-bit integers.
# NOTE: The column is replaced by direct assignment; `df.loc[:, col] = ...` writes the
#       values into the existing float column, which would silently undo the cast.
sales_train['item_cnt_day'] = sales_train['item_cnt_day'].astype(np.int32)

# 3 Making the base set

We will here use the trick from the ensembling exercise where we create an outer product of the `item_id` and `shop_id` present in each block.

Recall that we are predicting for the `34`th month

In [None]:
sales_test.loc[:, 'date_block_num'] = 34

In [None]:
cols = ['date_block_num','shop_id','item_id']
data = []

# The training part: one grid per month
for block in sales_train.loc[:,'date_block_num'].unique():
    tmp = sales_train.loc[sales_train.loc[:, 'date_block_num'] == block, cols]
    shops_in_block = tmp.loc[:,'shop_id'].unique()
    items_in_block = tmp.loc[:,'item_id'].unique()
    # NOTE: Outer product of the month with every shop and every item seen in that month
    grid = list(product([block], shops_in_block, items_in_block))
    data.append(np.array(grid))

# Stack the monthly grids and sort them on the identifier columns
data = pd.DataFrame(np.vstack(data), columns=cols).sort_values(cols)

We do not want to expand the test set, as we will use this for predictions

In [None]:
data = pd.concat([data, sales_test.loc[:, ['date_block_num','shop_id','item_id']]], axis=0)

# 4 Generating aggregated features

## Adding shop features

In [None]:
shop_drop_cols = ['item_price', 'item_cnt_day', 'item_category_id', 'revenue']
shop = pd.merge(sales_train, items.loc[:, ['item_id', 'item_category_id']], how='left', on=['item_id'])

In [None]:
shop.loc[:, 'revenue'] = shop.loc[:, 'item_price'] * shop.loc[:, 'item_cnt_day']

### Sum aggregates

In [None]:
shop_sum = shop.copy()

In [None]:
# Revenue summed per (month, shop), broadcast back onto every sales row.
# The string alias replaces the builtin `sum`, which pandas deprecates as a transform argument.
shop_sum.loc[:, 'month_shop_revenue_sum'] = \
    shop_sum.groupby(['date_block_num', 'shop_id'])['revenue'].transform('sum')

In [None]:
# Item count summed per (month, shop, item), broadcast back onto every sales row.
# The string alias replaces the builtin `sum`, which pandas deprecates as a transform argument.
shop_sum.loc[:, 'month_shop_item_id_item_cnt_sum'] = \
    shop_sum.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].transform('sum')

In [None]:
# Item count summed per (month, shop), broadcast back onto every sales row.
# The string alias replaces the builtin `sum`, which pandas deprecates as a transform argument.
shop_sum.loc[:, 'month_shop_item_cnt_sum'] = \
    shop_sum.groupby(['date_block_num', 'shop_id'])['item_cnt_day'].transform('sum')

In [None]:
shop_sum.drop(shop_drop_cols, axis=1, inplace=True)
shop_sum.head()

### Mean aggregates

In [None]:
shop_mean = shop.copy()

In [None]:
# Mean revenue per shop over all months, broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
shop_mean.loc[:, 'shop_revenue_avg'] = \
    shop_mean.groupby(['shop_id'])['revenue'].transform('mean')

In [None]:
# Mean daily item count per (month, shop), broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
shop_mean.loc[:, 'month_shop_item_cnt_avg'] = \
    shop_mean.groupby(['date_block_num', 'shop_id'])['item_cnt_day'].transform('mean')

In [None]:
# Mean daily item count per (month, shop, item), broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
shop_mean.loc[:, 'month_shop_item_id_item_cnt_avg'] = \
    shop_mean.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].transform('mean')

In [None]:
# Mean daily item count per (month, shop, item category), broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
shop_mean.loc[:, 'month_shop_item_cat_item_cnt_avg'] = \
    shop_mean.groupby(['date_block_num', 'shop_id', 'item_category_id'])['item_cnt_day'].transform('mean')

In [None]:
shop_mean.drop(shop_drop_cols, axis=1, inplace=True)
shop_mean.head()

### Other aggregates

In [None]:
shop_other = shop.copy()

In [None]:
# NOTE: We use a slightly different notation on this feature as it can be merged on shop instead of on shop-item
# Number of distinct items ever sold by each shop. The 'nunique' alias replaces a
# per-group Python lambda (`len(np.unique(...))`).
# NOTE(review): 'nunique' ignores NaN; this assumes `item_id` has no missing values - confirm.
shop_other.loc[:, 'shop_unique_items_len'] = \
    shop_other.groupby('shop_id')['item_id'].transform('nunique')

In [None]:
# Largest daily item count per (shop, item, month), broadcast back onto every sales row.
# The string alias replaces the builtin `max`, which pandas deprecates as a transform argument.
shop_other.loc[:, 'month_shop_item_id_item_cnt_max'] = \
    shop_other.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].transform('max')

In [None]:
# Smallest daily item count per (shop, item, month), broadcast back onto every sales row.
# The string alias replaces the builtin `min`, which pandas deprecates as a transform argument.
shop_other.loc[:, 'month_shop_item_id_item_cnt_min'] = \
    shop_other.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].transform('min')

In [None]:
shop_other.drop(shop_drop_cols, axis=1, inplace=True)
shop_other.head()

## Adding non-shop features

In [None]:
non_shop_drop_cols = ['item_price', 'item_category_id', 'item_cnt_day']
non_shop = pd.merge(sales_train, items.loc[:, ['item_id', 'item_category_id']], how='left', on=['item_id'])

Aggregates by item and month

In [None]:
# Item count summed per (month, item) across all shops, broadcast back onto every sales row.
# The string alias replaces the builtin `sum`, which pandas deprecates as a transform argument.
non_shop.loc[:, 'month_item_id_item_cnt_sum'] = \
    non_shop.groupby(['date_block_num', 'item_id'])['item_cnt_day'].transform('sum')

In [None]:
# Mean daily item count per (month, item) across all shops, broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
non_shop.loc[:, 'month_item_id_item_cnt_avg'] = \
    non_shop.groupby(['date_block_num', 'item_id'])['item_cnt_day'].transform('mean')

In [None]:
# Mean daily item count per (month, item category), broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
non_shop.loc[:, 'month_item_cat_item_cnt_avg'] = \
    non_shop.groupby(['date_block_num', 'item_category_id'])['item_cnt_day'].transform('mean')

In [None]:
# Mean daily item count per month over all shops and items, broadcast back onto every sales row.
# The string alias replaces `np.mean`, which pandas deprecates as a transform argument.
non_shop.loc[:, 'month_item_cnt_avg'] = \
    non_shop.groupby(['date_block_num'])['item_cnt_day'].transform('mean')

In [None]:
# Largest daily item count per (month, item), broadcast back onto every sales row.
# The string alias replaces the builtin `max`, which pandas deprecates as a transform argument.
non_shop.loc[:, 'month_item_id_item_cnt_max'] = \
    non_shop.groupby(['date_block_num', 'item_id'])['item_cnt_day'].transform('max')

In [None]:
# Smallest daily item count per (month, item), broadcast back onto every sales row.
# The string alias replaces the builtin `min`, which pandas deprecates as a transform argument.
non_shop.loc[:, 'month_item_id_item_cnt_min'] = \
    non_shop.groupby(['date_block_num', 'item_id'])['item_cnt_day'].transform('min')

In [None]:
non_shop.drop(non_shop_drop_cols, axis=1, inplace=True)
non_shop.head()

#### Merge and clean-up

In [None]:
# Remove exact duplicate rows from each feature frame, in place
for feature_frame in (shop_sum, shop_mean, shop_other, non_shop):
    feature_frame.drop_duplicates(inplace=True)

In [None]:
merge_on = ['date', 'date_block_num', 'shop_id', 'item_id']

# Left-join the remaining feature frames onto the sum aggregates, one after the other
aggregates = shop_sum
for feature_frame in (shop_mean, shop_other, non_shop):
    aggregates = pd.merge(aggregates, feature_frame, how='left', on=merge_on)

We check that we didn't introduce any `NaN`s or that we accidentally expanded the set 

In [None]:
# The left merges must not have produced any missing value
if aggregates.isnull().values.any():
    raise AssertionError('NaNs were created')

# The merged frame must not hold more rows than the raw training set
n_aggregates = len(aggregates)
if n_aggregates > n_train_samples:
    raise AssertionError(f'The training set was expanded: '
                         f'n_aggregates={n_aggregates} and n_train_samples={n_train_samples}')

We no longer need the day information in the aggregates, so we drop it and remove the resulting duplicates.

In [None]:
# Drop the day column, collapse the rows that became identical, and renumber the index
aggregates = (aggregates
              .drop('date', axis=1)
              .drop_duplicates()
              .reset_index(drop=True))

In [None]:
print(f'Shape = {aggregates.shape}')
aggregates.head()

We rejoin `item_category_id` as we will use this as a categorical feature

In [None]:
aggregates = pd.merge(aggregates, items.loc[:, ['item_id', 'item_category_id']], how='left', on='item_id')
print(f'Shape = {aggregates.shape}')

#### Clipping the target value

In this competition the predicted item count sold should be in the range $[0, 20]$.
This leaves us with two choices:

1. Clipping before training
2. Clipping after prediction

The disadvantage of 1. is that this will give us inconsistencies with other features like the revenue.
However, features like the aggregated revenue of a shop is expected to influence the sales of a shop, so it is not that critical that we reduce the correlation between these features.

On the other hand, if we go for option 2., the range of values which the model is trying to learn from is broad. This can lead to low performance as the target space becomes broad and therefore sparse.

Because of this we clip prior to training.

In [None]:
# Clip the monthly shop-item count to [0, 20] and write the result back.
# NOTE: `inplace=True` on a `.loc` selection acts on a temporary object and is not
#       guaranteed to change `aggregates`, so the clipped column is assigned explicitly.
aggregates['month_shop_item_id_item_cnt_sum'] = \
    aggregates['month_shop_item_id_item_cnt_sum'].clip(0, 20)

Finally, we would like to check how the target variable is distributed

In [None]:
fig, ax = plt.subplots()
aggregates.loc[:, 'month_shop_item_id_item_cnt_sum'].hist(ax =ax, bins=21)
ax.set_xlabel('item_cnt_month')
ax.set_ylabel('counts')
ax.grid(True)
plt.tight_layout()

In [None]:
del shop
del shop_sum
del shop_mean
del shop_other
del non_shop
gc.collect()

# 5 Aggregate EDA

We have now added several aggregated features.
In this section, we are in particular interested in exploring the relation between the categorical features and the target variable.

This exploration will be the foundation for creating mean encodings, where the purpose is to encode the categorical features (which have a lot of distinct values) in such a way that the relation with the target variable is taken into account. 

In [None]:
target = 'month_shop_item_id_item_cnt_sum'

In [None]:
shop_item_id_item_cnt_pivot = \
    aggregates.pivot_table(index='shop_id',
                           columns='item_id',
                           values=target,
                           aggfunc='count',
                           fill_value=0)
fig, ax = plt.subplots()
sns.heatmap(shop_item_id_item_cnt_pivot,
            ax=ax,
            cbar=True,
            cmap='viridis', 
            cbar_kws={'label': target})
plt.tight_layout()

We can see that certain items (for example around id $3429$ and $4279$) are sold broadly across all shops, that some shops have quite broad selection (between shop id $24$ and $33$).

We see that with the current encoding, the shops are not clustered around certain items, but are scattered around.

In [None]:
shop_item_cat_item_cnt_pivot = \
    aggregates.pivot_table(index='shop_id',
                           columns='item_category_id',
                           values=target,
                           aggfunc='count',
                           fill_value=0)
fig, ax = plt.subplots()
sns.heatmap(shop_item_cat_item_cnt_pivot,
            ax=ax,
            cbar=True,
            cmap='viridis', 
            cbar_kws={'label': target})
plt.tight_layout()

We observe that a few categories are dominating the sales (like category $40$ and $56$), and the others are contributing less.

In [None]:
item_id_item_cat_item_cnt_pivot = \
    aggregates.pivot_table(index='item_id',
                           columns='item_category_id',
                           values=target,
                           aggfunc='count',
                           fill_value=0)
fig, ax = plt.subplots()
sns.heatmap(item_id_item_cat_item_cnt_pivot,
            ax=ax,
            cbar=True,
            cmap='viridis', 
            cbar_kws={'label': target})
plt.tight_layout()

It does not appear that `item_id` is sorted in terms of `item_category_id`, hence both will provide information to the data set.

In [None]:
shop_date_block_num_item_cnt_pivot = \
    aggregates.pivot_table(index='shop_id',
                           columns='date_block_num',
                           values=target,
                           aggfunc='count',
                           fill_value=0)
fig, ax = plt.subplots()
sns.heatmap(shop_date_block_num_item_cnt_pivot,
            ax=ax,
            cbar=True,
            cmap='viridis', 
            cbar_kws={'label': target})
plt.tight_layout()

As with the EDA of the raw features, we see that we have seasonal trends, and that a few shops are dominating in terms of quantity.

## Conclusion

From the investigation above, we can argue that it would make sense to create mean encodings for all the categorical features, as they all appear to bring new information to the table.

# 6 Adding mean encoding

We will now mean encode the categorical features above based on how often (on average) the target variable appears in the categorical feature

In [None]:
cat_features = ['item_id', 'shop_id', 'item_category_id']

In [None]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
new_features = []

# Names of the columns created below
mean_enc_names = [feature + '_mean_enc' for feature in cat_features]

# In short: for every fold we aggregate the target mean on the training part,
# store it on the validation part, and finally fill the categories that were
# never seen in a training part with the global mean
for train_indices, valid_indices in kf.split(aggregates):

    # Train/validation split (kf.split yields positions, hence iloc)
    train = aggregates.iloc[train_indices]
    valid = aggregates.iloc[valid_indices]

    # Mean encoding
    for feature in cat_features:
        # NOTE: The lines below are equivalent to
        # agg_mean = train.loc[:, [feature, target]].groupby(feature)[target].mean()
        # followed by a left merge of agg_mean onto valid.loc[:, [feature]]
        fold_means = train.groupby(feature)[target].mean()
        mean_merged_on_valid_feature = valid.loc[:, feature].map(fold_means)

        # Store the results in aggregates.
        # The rows are addressed by label (valid.index) so that the assignment
        # stays correct even if the index is not a plain 0..n-1 range
        aggregates.loc[valid.index, feature + '_mean_enc'] = mean_merged_on_valid_feature

global_mean = aggregates.loc[:, target].mean()
# Only the encoding columns are filled, so NaNs in any other column are not masked
aggregates[mean_enc_names] = aggregates[mean_enc_names].fillna(global_mean)

In [None]:
aggregates.head()

Let's investigate how correlated the mean encodings are

In [None]:
mean_enc_cols = [col for col in aggregates.columns if 'mean_enc' in col]
corr_coef = aggregates.loc[:,[target, *mean_enc_cols]].corr()

In [None]:
corr_coef.loc[mean_enc_cols, target].to_frame()

In [None]:
fig, ax = plt.subplots()
ax.scatter(aggregates.loc[:, 'item_id_mean_enc'], aggregates.loc[:, target], label='item_id', alpha=0.2)
ax.scatter(aggregates.loc[:, 'shop_id_mean_enc'], aggregates.loc[:, target], label='shop_id', alpha=0.2)
ax.scatter(aggregates.loc[:, 'item_category_id_mean_enc'], aggregates.loc[:, target], label='item_category_id', alpha=0.2)
ax.set_ylabel('Target')
ax.set_xlabel('Encoding')
ax.legend(loc='best', fancybox=True, framealpha=0.5)
plt.tight_layout()

We see that the mean encodings are relatively well correlated with the target, and it doesn't appear that there is too much leakage.

Notice, however that the categorical features may be multivalued with respect to the mean encoding (due to the k-fold validation).
This becomes a problem when we will merge the encoded values to the test set.
To solve this, we will aggregate a mean of the mean encoding and merge this with the data data frame.

In [None]:
# Collapse the fold-dependent encodings to a single value per category:
# for every categorical feature, average '<feature>_mean_enc' over all rows
# sharing the same category and broadcast the result back onto those rows.
# The 'mean' alias replaces `np.mean`, which pandas deprecates as a transform argument.
for feature in cat_features:
    aggregates[f'{feature}_mean_mean_enc'] = \
        aggregates.groupby(feature)[f'{feature}_mean_enc'].transform('mean')

In [None]:
mean_mean_enc_cols = [col for col in aggregates.columns if 'mean_mean_enc' in col]
corr_coef = aggregates.loc[:,[target, *mean_mean_enc_cols]].corr()

In [None]:
corr_coef.loc[mean_mean_enc_cols, target].to_frame()

In [None]:
fig, ax = plt.subplots()
ax.scatter(aggregates.loc[:, 'item_id_mean_mean_enc'], aggregates.loc[:, target], label='item_id', alpha=0.2)
ax.scatter(aggregates.loc[:, 'shop_id_mean_mean_enc'], aggregates.loc[:, target], label='shop_id', alpha=0.2)
ax.scatter(aggregates.loc[:, 'item_category_id_mean_mean_enc'], aggregates.loc[:, target], label='item_category_id', alpha=0.2)
ax.set_ylabel('Target')
ax.set_xlabel('Encoding')
ax.legend(loc='best', fancybox=True, framealpha=0.5)
plt.tight_layout()

We see that the main features of the mean encoding are preserved.
We therefore remove the original mean encoded features from the aggregated data frame.

In [None]:
aggregates.drop(['item_id_mean_enc', 'shop_id_mean_enc', 'item_category_id_mean_enc'], axis=1, inplace=True)

# 7 Merging the aggregated features with the data dataframe

Before we make the temporal features, we start by merging the aggregated features with the data dataframe in order to easily add the lagged features to the test set (month $34$ in the data data frame).

**NOTE**: This section is long, and could probably be improved quite a bit. Essentially what happens is that we merge the different part of the aggregated data frame on the correct columns of the data data frame.

In [None]:
n_data_samples = data.shape[0]

We start by merging the item category id with the data data frame as we will later use item category as our merge-on column

In [None]:
merge_on = ['item_id']
data_aggregate = pd.merge(data,
                          items.loc[:, merge_on + ['item_category_id']],
                          how='left', 
                          on=merge_on)

Note that the different columns have different feature dependencies. We must ensure that we are merging the columns on the correct features:

In [None]:
# Pool of columns that has not yet been assigned to a merge group
current_cols = set(aggregates.columns)

# Sorted, because iterating a set of strings gives a different order in every session
mean_enc_cols = sorted(col for col in current_cols if 'mean_mean_enc' in col)

current_cols -= set(mean_enc_cols)

In [None]:
# Features keyed on (month, shop, item).
# Sorted, because iterating a set of strings gives a different order in every session,
# which would make the column order of the merged frame non-reproducible
month_shop_item_id_cols = sorted(col for col in current_cols if
                                 'month' in col and
                                 'shop' in col and
                                 'item_id' in col)

current_cols -= set(month_shop_item_id_cols)

In [None]:
# Features keyed on (month, shop, item category).
# Sorted, because iterating a set of strings gives a different order in every session
month_shop_item_cat_cols = sorted(col for col in current_cols if
                                  'month' in col and
                                  'shop' in col and
                                  'item_cat' in col)

current_cols -= set(month_shop_item_cat_cols)

In [None]:
# Features keyed on (month, shop).
# Sorted, because iterating a set of strings gives a different order in every session
month_shop_cols = sorted(col for col in current_cols if
                         'month' in col and
                         'shop' in col)

current_cols -= set(month_shop_cols)

In [None]:
# Features keyed on (month, item).
# Sorted, because iterating a set of strings gives a different order in every session
month_item_id_cols = sorted(col for col in current_cols if
                            'month' in col and
                            'item_id' in col)

current_cols -= set(month_item_id_cols)

In [None]:
# Features keyed on (month, item category).
# Sorted, because iterating a set of strings gives a different order in every session
month_item_cat_cols = sorted(col for col in current_cols if
                             'month' in col and
                             'item_cat' in col)

current_cols -= set(month_item_cat_cols)

In [None]:
# Features keyed on the month only.
# Sorted, because iterating a set of strings gives a different order in every session
month_cols = sorted(col for col in current_cols if
                    'month' in col)

current_cols -= set(month_cols)

In [None]:
# Features keyed on the shop only (the identifier 'shop_id' itself is excluded).
# Sorted, because iterating a set of strings gives a different order in every session
shop_cols = sorted(col for col in current_cols if
                   'shop' in col and
                   col != 'shop_id')

# Whatever is left in the pool are the identifier columns
identifier_cols = current_cols - set(shop_cols)

Check that we did the correct thing

In [None]:
if set(identifier_cols) != {'date_block_num', 'item_category_id', 'item_id', 'shop_id'}:
    raise AssertionError('identifier_cols is not correct')

Merge

In [None]:
merge_on = ['date_block_num', 'shop_id', 'item_id']
data_aggregate = pd.merge(data_aggregate,
                          aggregates.loc[:, merge_on + month_shop_item_id_cols].drop_duplicates(),
                          how='left', 
                          on=merge_on)

As we have expanded our dataset with shop id - item id combinations that have not been sold during that month, we can safely replace the `NaN`s prior to month $34$ with $0$.

In [None]:
# NOTE: An in-place fillna does not seem to work here (maybe because we are selecting a slice),
#       so the filled values are assigned back explicitly
train_rows = data_aggregate.loc[:, 'date_block_num']<34
filled = data_aggregate.loc[train_rows, month_shop_item_id_cols].fillna(0)
data_aggregate.loc[train_rows, month_shop_item_id_cols] = filled

In [None]:
merge_on = ['date_block_num', 'shop_id', 'item_category_id']
data_aggregate = pd.merge(data_aggregate,
                          aggregates.loc[:, merge_on + month_shop_item_cat_cols].drop_duplicates(),
                          how='left', 
                          on=merge_on)

Due to the shop id - item id expansion, new combinations of shop id - item category id may appear. 
We can also safely replace the `NaN`s prior to month $34$ with $0$.

In [None]:
# NOTE: An in-place fillna does not seem to work here (maybe because we are selecting a slice),
#       so the filled values are assigned back explicitly
train_rows = data_aggregate.loc[:, 'date_block_num']<34
filled = data_aggregate.loc[train_rows, month_shop_item_cat_cols].fillna(0)
data_aggregate.loc[train_rows, month_shop_item_cat_cols] = filled

In [None]:
merge_on = ['date_block_num', 'shop_id']
data_aggregate = pd.merge(data_aggregate, 
                          aggregates.loc[:, merge_on + month_shop_cols].drop_duplicates(), 
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new shops.

In [None]:
merge_on = ['date_block_num', 'item_id']
data_aggregate = pd.merge(data_aggregate,
                          aggregates.loc[:, merge_on + month_item_id_cols].drop_duplicates(),
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new items.

In [None]:
merge_on = ['date_block_num', 'item_category_id']
data_aggregate = pd.merge(data_aggregate,
                          aggregates.loc[:, merge_on + month_item_cat_cols].drop_duplicates(),
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new item categories.

In [None]:
merge_on = ['date_block_num']
data_aggregate = pd.merge(data_aggregate,
                          aggregates.loc[:, merge_on + month_cols].drop_duplicates(), 
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new months.

In [None]:
merge_on = ['shop_id']
data_aggregate = pd.merge(data_aggregate,
                          aggregates.loc[:, merge_on + shop_cols].drop_duplicates(),
                          how='left', 
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new shops.

Finally, we merge the mean encoded features:

In [None]:
merge_on = ['shop_id']
data_aggregate = pd.merge(data_aggregate, 
                          aggregates.loc[:, merge_on + ['shop_id_mean_mean_enc']].drop_duplicates(),
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new shops.

In [None]:
merge_on = ['item_id']
data_aggregate = pd.merge(data_aggregate, 
                          aggregates.loc[:, merge_on + ['item_id_mean_mean_enc']].drop_duplicates(),
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new items.

In [None]:
merge_on = ['item_category_id']
data_aggregate = pd.merge(data_aggregate, 
                          aggregates.loc[:, merge_on + ['item_category_id_mean_mean_enc']].drop_duplicates(),
                          how='left',
                          on=merge_on)

No new `NaN`s should appear after the above merge as the expansion of the training set does not introduce new categories.

We check that we didn't introduce any `NaN`s in the training set or that we accidentally expanded the set 

In [None]:
# The training months (everything before block 34) must be free of NaNs
if data_aggregate.loc[data_aggregate.loc[:, 'date_block_num']<34].isnull().any().any():
    raise AssertionError('NaNs were created')

# The merges must not have added rows to the base set.
# NOTE: The row count of the merged frame is compared here; the check previously used
#       `n_aggregates` (the size of the aggregates frame), so it never tested this frame
n_data_aggregate = data_aggregate.shape[0]
if n_data_aggregate > n_data_samples:
    raise AssertionError(f'The set was expanded: '
                         f'n_data_samples={n_data_samples} and n_data_aggregate={n_data_aggregate}')

Only two types of features should now contain `NaN`s in the test set:

* Monthly features - as these are based on item counts not present in the test set
* Item id features - as the test set contains new items

In [None]:
test_nan_features = data_aggregate.loc[data_aggregate.loc[:, 'date_block_num']==34].isnull().any()
print(test_nan_features)

We check that the monthly features contain only `NaN`s

In [None]:
cols_w_month = [col for col in data_aggregate.columns if 'month' in col]
if not data_aggregate.loc[data_aggregate.loc[:, 'date_block_num']==34, cols_w_month].isnull().all().all():
    raise AssertionError('The monthly features contained something else than NaNs')   

We need to fill the `NaN` values for the non-month features

In [None]:
# Columns flagged as containing at least one NaN in the test month
nan_col = test_nan_features.index[test_nan_features.values].tolist()
# Of those, keep the ones that are not monthly features (in data frame column order)
non_month_nan_cols = [col for col in data_aggregate.columns
                      if col in nan_col and col not in cols_w_month]
print(non_month_nan_cols)

There are several options to fill these values.
As we saw from the EDA of the raw data, only $1.6\%$ of the item ids were present in the test set but not in the training set. 
Thus, it should not matter too much what we fill these values with. 
We choose to use the mean encoding trick and fill them with the global mean.

In [None]:
# Items without an encoding get the global target mean.
# NOTE: `inplace=True` on a `.loc` selection acts on a temporary object and is not
#       guaranteed to change `data_aggregate`, so the filled column is assigned explicitly.
data_aggregate['item_id_mean_mean_enc'] = \
    data_aggregate['item_id_mean_mean_enc'].fillna(global_mean)

# 8 Adding temporal history

As this is a sequential problem, we would like to incorporate some time information into the training set.

Note that all the monthly aggregated features we generated above will only be present as lagged features in the test set when we create the lagged features.

In [None]:
def make_lagged(df, col, lags, merge_on, fillna=0):
    """
    Makes lagged features

    A lag is made by adding the lag number to date_block_num of a copy of the
    feature and left-merging that copy back on `merge_on`, so that the row of
    month `m` receives the value the feature had in month `m - lag`.

    Parameters
    ----------
    df : DataFrame
        The data frame holding `col` and the `merge_on` columns.
        It is not modified.
    col : str
        The name of the feature to lag
    lags : list
        The number of months to lag
    merge_on : list
        The columns to merge on (must contain 'date_block_num').
        `col` is expected to have a single value per combination of these
        columns; otherwise the merge adds rows and an AssertionError is raised.
    fillna : float
        The value to fill the NaNs with

    Returns
    -------
    lag_df : DataFrame
        A DataFrame with one row per row of `df`, containing the `merge_on`
        columns and one `{col}_lag_{lag}` column per lag

    Raises
    ------
    KeyError
        If 'date_block_num' is not part of `merge_on`
    AssertionError
        If the merges increased the number of rows
    """

    if 'date_block_num' not in merge_on:
        raise KeyError("'date_block_num' must be one of the merge_on columns")

    lag_df = df.loc[:, merge_on + [col]].copy()

    samples = lag_df.shape[0]

    # The feature is de-duplicated once, outside the loop; every lag is derived from
    # this small frame instead of copying and de-duplicating the growing result
    base = lag_df.drop_duplicates()

    for lag in lags:
        print(f'Processing lag {lag}', end='\r')
        new_col = f'{col}_lag_{lag}'
        shifted = base.rename(columns={col: new_col})
        # The value observed in month m becomes visible in month m + lag
        shifted['date_block_num'] = shifted['date_block_num'] + lag
        lag_df = pd.merge(lag_df, shifted, how='left', on=merge_on)

    # Drop the original feature and fill the months without history
    lag_df = lag_df.drop(columns=col).fillna(fillna)

    cur_samples = lag_df.shape[0]
    if cur_samples > samples:
        raise AssertionError(f'Sample size increased. Old: {samples}, new: {cur_samples}')

    return lag_df

Our strategy is to lag the features in the following way:

**Long lag** (i.e. by $1$, $2$, $3$, $6$, $9$ and $12$ months):
These are the features whose time trend is believed to be most important for prediction
* Merge on: month, shop, item
    * `month_shop_item_id_item_cnt_sum`
* Merge on: month, shop
    * `month_shop_item_cnt_sum`
* Merge on: month, item
    * `month_item_id_item_cnt_sum`

**Short lag** (i.e. by only $1$ month)
These are the features whose time trend is believed to be less important for prediction
* Merge on: month, shop, item
    * `month_shop_item_id_item_cnt_max`
    * `month_shop_item_id_item_cnt_min`
    * `month_shop_item_id_item_cnt_avg`
* Merge on: month, shop, category
    * `month_shop_item_cat_item_cnt_avg`
* Merge on: month, shop
    * `month_shop_item_cnt_avg`
    * `month_shop_revenue_sum`
* Merge on: month, item
    * `month_item_id_item_cnt_max`
    * `month_item_id_item_cnt_min`
    * `month_item_id_item_cnt_avg`
* Merge on: month, category
    * `month_item_cat_item_cnt_avg`
* Merge on: month
    * `month_item_cnt_avg`

**Fillna strategy**:

For the shop-item combinations we can safely fill the resulting `NaN`s with $0$ as the combination simply did not sell anything the previous `lag`-months.
In fact, the same argument goes for all other merging strategies as well.

## Long time lag

In [None]:
long_lags = [1, 2, 3, 6, 9, 12]

In [None]:
merge_on = ['date_block_num', 'item_id', 'shop_id']
tmp = make_lagged(df=data_aggregate,
                  col='month_shop_item_id_item_cnt_sum',
                  lags=long_lags,
                  merge_on=merge_on)

We make a minor sanity check: 

1. We select an infrequent value of `'month_shop_item_id_item_cnt_sum'` in the zeroth month
2. We loop through the item-shop combinations which have this value in the zeroth month to find one that is present in all the lag months  
3. We check that the lagged columns reproduce this value in the corresponding lag months

In [None]:
col = 'month_shop_item_id_item_cnt_sum'

In [None]:
data_aggregate.loc[data_aggregate.loc[:, 'date_block_num'] == 0, col].value_counts()

In [None]:
non_freq_number = 3

In [None]:
# Brute force search for one (shop, item) pair that has the value `non_freq_number`
# in month 0 and has a row in every month listed in `months`

# Candidate rows: month 0 rows whose value of `col` is close to `non_freq_number`
search_through_df = data_aggregate.loc[(data_aggregate.loc[:, 'date_block_num'] == 0) &
                                       (np.isclose(data_aggregate.loc[:, col], non_freq_number))]

unique_item = search_through_df.loc[:, 'item_id']
unique_shop = search_through_df.loc[:, 'shop_id']
# All long lags except the last two
months = long_lags[:-2]

# Stay None until a matching combination has been found
item_id = None
shop_id = None

# Number of consecutive months (of the current candidate) that had a row
found = 0

for cur_shop, cur_item in zip(unique_shop, unique_item):
    for cur_month in months:
        entries = data_aggregate.loc[(data_aggregate.loc[:, 'item_id'] == cur_item) &
                                     (data_aggregate.loc[:, 'shop_id'] == cur_shop) &
                                     (data_aggregate.loc[:, 'date_block_num'] == cur_month)]
        if entries.shape[0] != 0:
            found += 1
        else:
            # The candidate is missing in this month: reset and try the next candidate
            found = 0
            break
            
        if found == len(months):
            # A combination of item_id and shop_id which is present in all months has been found 
            item_id = cur_item
            shop_id = cur_shop
    
    # Stop at the first candidate that passed all months
    if item_id is not None and shop_id is not None:
        break

if item_id is None and shop_id is None:
    print('No entries matching the search was found')

In [None]:
data_aggregate.loc[(data_aggregate.loc[:, 'item_id'] == item_id) &
                   (data_aggregate.loc[:, 'shop_id'] == shop_id), ['item_id', 'shop_id', 'date_block_num', col]]

This looks like a good item-shop combination to use for testing the lagging procedure.

In [None]:
# Lag months in which the selected item-shop combination has a row
combo_rows = ((data_aggregate.loc[:, 'item_id'] == item_id) &
              (data_aggregate.loc[:, 'shop_id'] == shop_id))
months = set(long_lags).intersection(set(data_aggregate.loc[combo_rows, 'date_block_num']))

if not months:
    # Nothing could be compared (e.g. the search above found no combination),
    # so we must not report success
    print('No lag months available for the selected item-shop combination: lag function NOT verified')
else:
    for lag in months:
        # In month `lag`, the lag-`lag` column must hold the value from month 0
        lagged_val = tmp.loc[(tmp.loc[:, 'date_block_num'] == lag) &
                             (tmp.loc[:, 'shop_id'] == shop_id) &
                             (tmp.loc[:, 'item_id'] == item_id),
                             f'{col}_lag_{lag}'].drop_duplicates()
        # An empty or multi-valued selection is a failure as well; `.all()` avoids
        # taking the truth value of a whole Series
        if lagged_val.empty or not np.isclose(non_freq_number, lagged_val).all():
            raise AssertionError('Oh dear, something is wrong with the lag function...')

    print('Lag function looks OK!')

In [None]:
data_aggregate = pd.merge(data_aggregate, tmp.drop_duplicates(), how='left', on=merge_on)

In [None]:
def add_lagged_feature(df, col, merge_on, lags):
    """
    Attach lagged versions of a feature to the input data frame

    The lagged columns are produced by `make_lagged`, de-duplicated and
    left-joined onto `df` using `merge_on` as the join keys.
    
    Parameters
    ----------
    df : DataFrame
        The data frame to add the lagged features to
    col : str
        The feature to make the lagged features of
    merge_on : list
        List of features to merge on
    lags : list
        The lags (passed on to `make_lagged`)

    Returns
    -------
    DataFrame
        The result of merging the lagged features into `df`
    """
    lagged = make_lagged(df=df, col=col, lags=lags, merge_on=merge_on)
    unique_lagged = lagged.drop_duplicates()

    return df.merge(unique_lagged, how='left', on=merge_on)

In [None]:
# Add `month_shop_item_cnt_sum` lagged by each of `long_lags`, joined on month and shop
merge_on = ['date_block_num', 'shop_id']
data_aggregate = add_lagged_feature(df=data_aggregate,
                                    col='month_shop_item_cnt_sum',
                                    merge_on=merge_on,
                                    lags=long_lags)

In [None]:
# Add `month_item_id_item_cnt_sum` lagged by each of `long_lags`, joined on month and item
merge_on = ['date_block_num', 'item_id']
data_aggregate = add_lagged_feature(df=data_aggregate,
                                    col='month_item_id_item_cnt_sum',
                                    merge_on=merge_on,
                                    lags=long_lags)

## Short time lag

In [None]:
# Lags passed to `add_lagged_feature` for the short time lag features below
short_lags = [1]

In [None]:
# Short lags of the max/min/avg features, joined on month, shop and item
merge_on = ['date_block_num', 'shop_id', 'item_id']
for lag_source_col in ('month_shop_item_id_item_cnt_max',
                       'month_shop_item_id_item_cnt_min',
                       'month_shop_item_id_item_cnt_avg'):
    data_aggregate = add_lagged_feature(data_aggregate, lag_source_col, merge_on, short_lags)

In [None]:
# Short lags of `month_shop_item_cat_item_cnt_avg`, joined on month, shop and item category
merge_on = ['date_block_num', 'shop_id', 'item_category_id']
data_aggregate = add_lagged_feature(df=data_aggregate,
                                    col='month_shop_item_cat_item_cnt_avg',
                                    merge_on=merge_on,
                                    lags=short_lags)

In [None]:
# Short lags of the features below, joined on month and shop
merge_on = ['date_block_num', 'shop_id']
for lag_source_col in ('month_shop_item_cnt_avg',
                       'month_shop_revenue_sum'):
    data_aggregate = add_lagged_feature(data_aggregate, lag_source_col, merge_on, short_lags)

In [None]:
# Short lags of the max/min/avg features, joined on month and item
merge_on = ['date_block_num', 'item_id']
for lag_source_col in ('month_item_id_item_cnt_max',
                       'month_item_id_item_cnt_min',
                       'month_item_id_item_cnt_avg'):
    data_aggregate = add_lagged_feature(data_aggregate, lag_source_col, merge_on, short_lags)

In [None]:
# Short lags of `month_item_cat_item_cnt_avg`, joined on month and item category
merge_on = ['date_block_num', 'item_category_id']
data_aggregate = add_lagged_feature(df=data_aggregate,
                                    col='month_item_cat_item_cnt_avg',
                                    merge_on=merge_on,
                                    lags=short_lags)

In [None]:
# Short lags of `month_item_cnt_avg`, joined on month only
merge_on = ['date_block_num']
data_aggregate = add_lagged_feature(df=data_aggregate,
                                    col='month_item_cnt_avg',
                                    merge_on=merge_on,
                                    lags=short_lags)

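Verify that we did not introduce `NaN`s in the lagged features for the months after the largest lag, and that the merges did not add rows to the data frame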

In [None]:
# All columns created by the lagging procedure
cols_w_lag = [col for col in data_aggregate.columns if '_lag_' in col]

# Only the months later than the largest long lag are inspected for NaNs
after_largest_lag = data_aggregate.loc[:, 'date_block_num'] > max(long_lags)
if data_aggregate.loc[after_largest_lag, cols_w_lag].isnull().any().any():
    # The message now states the condition which actually triggers the error
    raise AssertionError('The lagged features contain NaNs after the largest lag')

# A left merge on non-unique keys would add rows, so the row count must not exceed `n_data_samples`
n_rows_after_merge = data_aggregate.shape[0]
if n_rows_after_merge > n_data_samples:
    # Report the two numbers which are actually compared
    raise AssertionError(f'The set was expanded: '
                         f'n_data_samples={n_data_samples} and n_rows_after_merge={n_rows_after_merge}')

In [None]:
# Release the intermediate data frames and run the garbage collector
del search_through_df, tmp
gc.collect()

# 9 Storing the data frame

Our data now consists of the following columns:

In [None]:
# Display the column names of the final data frame in sorted order
sorted(data_aggregate.columns)

In [None]:
# Make sure the output directory exists next to the notebook's working directory
generated_data = Path('.').absolute() / 'generated_data'
generated_data.mkdir(exist_ok=True)

# Store the final data frame as HDF under the key 'data_aggregate'
hdf_path = generated_data / 'data_aggregate.hdf'
data_aggregate.to_hdf(hdf_path, key='data_aggregate')