# Generate auxiliary features

# TOC

FIXME

FIXME

FIXME

FIXME

FIXME

FIXME

* [1-A b](#1-A-b)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
import gc
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import requests
import sklearn
import sklearn.feature_extraction.text
import sklearn.preprocessing
from dateutil.relativedelta import relativedelta

# Loading the data

In [None]:
# The raw files are expected in ./data relative to the working directory
data_dir = Path('.').absolute() / 'data'

sales_train = pd.read_csv(data_dir / 'sales_train.csv.gz')
sales_test = pd.read_csv(data_dir / 'test.csv.gz')
items = pd.read_csv(data_dir / 'items.csv')
item_categories = pd.read_csv(data_dir / 'item_categories.csv')
shops = pd.read_csv(data_dir / 'shops.csv')

In [None]:
# Remember the original row counts: the guard before storing the frames
# compares against both to verify that no merge added or removed rows
n_train_samples = sales_train.shape[0]
n_test_samples = sales_test.shape[0]

Cast the dates to actual dates for easier manipulation

In [None]:
# NOTE: The column is replaced instead of written through `.loc[:, 'date']`.
#       An in-place `.loc` write keeps the original object dtype in
#       pandas >= 2.0, which would break the `.dt` accessor used later on
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')

# Adding date features

In [None]:
dates = sales_train[['date_block_num', 'date']]

# Add one date for the predict date
# NOTE: The relativedelta module takes care of problems with dates ending with 28, 30, 31
last_date = dates['date'].max()
last_date_block_num = dates['date_block_num'].max()
next_month = last_date + relativedelta(months=1)
next_date_block_num = last_date_block_num + 1
test_month = pd.DataFrame({'date_block_num': [next_date_block_num], 'date': [next_month]})
dates = pd.concat([dates, test_month], axis=0)

In [None]:
# Display the latest date, which now is the date appended for the prediction month
dates['date'].max()

Recall from the EDA that we found out that the last date in dataset was `2015-10-31`, this means we are going to predict for `2015-11`. 

Further, we note that only the year and month data is present in the test dataset, meaning that using information on the day level does not make sense.

### Standard date features

We here add date features as seasonal trends are present in the dataset

In [None]:
# Every feature is read from the attribute with the same name on the `.dt` accessor
for date_feature in ('year', 'month', 'days_in_month', 'quarter'):
    dates[date_feature] = getattr(dates['date'].dt, date_feature)

### Holidays

We will here generate the number of holidays in the previous month, the current month and the next month

In [None]:
def get_russian_holidays(year):
    """
    Returns a Series of Russian holidays in a given year

    The holidays are read from the first table found on the timeanddate.com
    holiday page of the given year.

    Parameters
    ----------
    year : int
        The year to investigate

    Returns
    -------
    holidays : Series
        Series of the holidays on datetime64 format

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code
    requests.Timeout
        If the server does not answer within the timeout
    """

    url = f'https://www.timeanddate.com/holidays/russia/{year}'
    # NOTE: Without a timeout a stalled connection would block forever, and
    #       without the status check an error page would be parsed as if it
    #       was the holiday page
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.content
    # A list is returned
    table_df = pd.read_html(html)[0]
    # Rename
    table_df = table_df.rename(columns={'Date': 'date'})
    # NOTE: The format has no year component, the year is set below
    holidays = pd.to_datetime(table_df['date'], format='%b %d')

    # Replace the year and cast to datetime
    holidays = holidays.apply(lambda x: x.replace(year=year))

    return holidays

In [None]:
def get_year_months_len(df):
    """
    Returns the number of entries grouped by year and month of the input data frame

    The input data frame is left untouched, the result is returned as a copy.

    Parameters
    ----------
    df : DataFrame
        DataFrame with a datetime column named 'date'

    Returns
    -------
    result : DataFrame
        A copy of the input DataFrame where the number of entries grouped by
        year and month is appended to the column named 'year_month_count'
    """

    result = df.copy()

    # Helper frame holding the grouping keys, so that `year` and `month`
    # do not end up in the result
    grouping = df.copy()
    grouping.loc[:, 'year'] = grouping.loc[:, 'date'].dt.year
    grouping.loc[:, 'month'] = grouping.loc[:, 'date'].dt.month

    # NOTE: transform keeps one value per input row, so the result aligns
    #       with the rows of the copy
    result['year_month_count'] = grouping.groupby(['year', 'month'])['date'].transform(len)

    return result

In [None]:
# NOTE: We include 2012 to get the first prev_holiday_count later
holiday_2012 = get_russian_holidays(2012).to_frame()
holiday_2013 = get_russian_holidays(2013).to_frame()
holiday_2014 = get_russian_holidays(2014).to_frame()
holiday_2015 = get_russian_holidays(2015).to_frame()
holidays = pd.concat([holiday_2012, holiday_2013, holiday_2014, holiday_2015])

In [None]:
# Count the holidays per year and month, and give the count column its final name
holiday_count = get_year_months_len(holidays)
holiday_count = holiday_count.rename(columns={'year_month_count': 'holiday_count'})

Let's now generate the previous month holidays count.
We can get that by increasing the month of every holiday by one (if the holiday count of February was 1 and the holiday count of March was 2, the previous-month holiday count of March will be 1).

In [None]:
# Moving every holiday one month forward makes the counts line up with the following month
prev_holiday_count = holiday_count.rename(columns={'holiday_count': 'prev_holiday_count'})
prev_holiday_count['date'] = prev_holiday_count['date'] + pd.DateOffset(months=1)

Likewise, we can find the next month holiday count by subtracting the months by 1

In [None]:
# Moving every holiday one month back makes the counts line up with the preceding month
next_holiday_count = holiday_count.rename(columns={'holiday_count': 'next_holiday_count'})
next_holiday_count['date'] = next_holiday_count['date'] - pd.DateOffset(months=1)

We drop the `date` and create `year` and `month` features we can merge on. 

**NOTE**: In order to merge the date data smoothly afterwards, we should drop the resulting duplicates

In [None]:
# Swap the date for the (year, month) merge keys and keep one row per key
holiday_count['year'] = holiday_count['date'].dt.year
holiday_count['month'] = holiday_count['date'].dt.month
holiday_count = holiday_count.drop(columns=['date']).drop_duplicates()

In [None]:
# Swap the date for the (year, month) merge keys and keep one row per key
prev_holiday_count['year'] = prev_holiday_count['date'].dt.year
prev_holiday_count['month'] = prev_holiday_count['date'].dt.month
prev_holiday_count = prev_holiday_count.drop(columns=['date']).drop_duplicates()

In [None]:
# Swap the date for the (year, month) merge keys and keep one row per key
next_holiday_count['year'] = next_holiday_count['date'].dt.year
next_holiday_count['month'] = next_holiday_count['date'].dt.month
next_holiday_count = next_holiday_count.drop(columns=['date']).drop_duplicates()

We merge the previous, current and next holiday count into one frame.
The resulting `NaN`s will be months without holidays.
We start by merging with `dates`, as this contains all relevant `year`-`month` combinations

In [None]:
# Start from every (date_block_num, year, month) combination present in `dates`
# NOTE: The date block number is taken from the data instead of being
#       re-created from the sort order, so that the mapping stays correct even
#       if a month should be missing from `dates`
holidays = dates.loc[:, ['date_block_num', 'year', 'month']].drop_duplicates()

# Months without a match in a holiday frame have no holidays, hence the 0
for count_frame in (holiday_count, prev_holiday_count, next_holiday_count):
    holidays = pd.merge(holidays, count_frame, how='left', on=['year', 'month']).fillna(0)

# Re-shuffle the columns for better overview
holidays = holidays.loc[:, ['year', 'month', 'prev_holiday_count', 'holiday_count',
                            'next_holiday_count', 'date_block_num']]

# All date and count columns can be integers
int32_cols = ['year', 'month', 'prev_holiday_count', 'holiday_count', 'next_holiday_count']
holidays = holidays.astype({int32_col: np.int32 for int32_col in int32_cols})

# Sort by year and month for better overview
holidays.sort_values(['year', 'month'], inplace=True)
holidays.reset_index(inplace=True, drop=True)

Inspect that we did the correct thing

In [None]:
# Display the merged holiday counts for visual inspection
holidays

This looks correct. Let's merge these to a common data frame.

In [None]:
# NOTE: `year` and `month` exist in both frames, so they are part of the merge
#       key. Merging on `date_block_num` alone would duplicate them as
#       `year_x`/`year_y` and `month_x`/`month_y`
dates = pd.merge(dates, holidays, how='left', on=['date_block_num', 'year', 'month'])

In [None]:
# Free the intermediate holiday frames
del holiday_count, next_holiday_count, prev_holiday_count
del holiday_2012, holiday_2013, holiday_2014, holiday_2015
del holidays
gc.collect()

# Adding leakage features

The leakage features are features where we use information about the test set.

As both shop id and item id are features of the test set, and since these are not related to time, these are leakages.

### Number of ids in train and test

In [None]:
# Count how many rows each shop_id has in train and test combined
shop_id_train = sales_train['shop_id']
shop_id_test = sales_test['shop_id']
shop_id_both = pd.concat([shop_id_train, shop_id_test], axis=0).to_frame()
shop_id_both['shop_id_count'] = shop_id_both.groupby('shop_id')['shop_id'].transform(len)

# NOTE: Drop duplicated as we want to merge
shop_id_both = shop_id_both.drop_duplicates()

In [None]:
# Count how many rows each item_id has in train and test combined
item_id_train = sales_train['item_id']
item_id_test = sales_test['item_id']
item_id_both = pd.concat([item_id_train, item_id_test], axis=0).to_frame()
item_id_both['item_id_count'] = item_id_both.groupby('item_id')['item_id'].transform(len)

# NOTE: Drop duplicated as we want to merge
item_id_both = item_id_both.drop_duplicates()

Out of curiosity we check how these are distributed

In [None]:
# Histogram of the number of rows per shop_id
fig, ax = plt.subplots()
shop_id_both['shop_id_count'].hist(bins=200, ax=ax)
ax.set_ylabel('count')
ax.set_xlabel('shop_id_count')
plt.tight_layout()

In [None]:
# Summary statistics of how often each distinct shop_id_count value occurs
shop_id_both.loc[:, 'shop_id_count'].value_counts().describe()

It appears that the number of rows for each `shop_id` is well spread, and not clustering around a specific number

In [None]:
# Histogram of the number of rows per item_id
fig, ax = plt.subplots()
item_id_both['item_id_count'].hist(bins=200, ax=ax)
ax.set_ylabel('count')
ax.set_xlabel('item_id_count')
plt.tight_layout()

In [None]:
# Summary statistics of how often each distinct item_id_count value occurs
item_id_both.loc[:, 'item_id_count'].value_counts().describe()

In [None]:
# Display the first rows of the item counts
item_id_both.head()

### The ID

As we saw from the EDA, the `ID` was highly correlated to the `shop_id`, so we include it here. Items and shops without an ID will be given `-1` (although we could probably construct a more appropriate `ID` feature if we checked the feature more)

In [None]:
# Look up the test `ID` of every (shop_id, item_id) pair in the train set
id_df = pd.merge(sales_train, sales_test, how='left', on=['shop_id', 'item_id'])
# NOTE: The column is replaced in one assignment. An inplace `fillna` on a
#       `.loc` selection may act on a temporary object, which would leave the
#       NaNs in `id_df` and make the integer cast fail
# Pairs which are not in the test set get the ID -1
id_df['ID'] = id_df['ID'].fillna(-1).astype('int32')

### Row number

As the test set contains data after the train data, these will have a higher row number.
Of course, we could be unlucky and have a test set which is shuffled with respect to the training set, but we nevertheless give it a shot

In [None]:
# NOTE: The test rows continue the numbering directly after the last train
#       row, i.e. the offset is the number of train rows. Using the last
#       train row number as offset would give the first test row the same
#       row number as the last train row
n_train_rows = len(sales_train.index)
n_test_rows = len(sales_test.index)
row_train = pd.DataFrame({'row_nr': np.arange(n_train_rows, dtype=np.int64)})
row_test = pd.DataFrame({'row_nr': np.arange(n_test_rows, dtype=np.int64) + n_train_rows})

# Adding text features

Taking into account the possibility that the names are correlated to the target, we add some text features as well. We split `item_name`, `shop_name` and `item_category_name` into cyrillic and latin words. We will stem these, and then combine them again before fitting a TF-IDF model to them.

**NOTE**: The TF-IDF model does not care about the relative position of the words, so it is ok if the order is scrambled when recombining the words to sentences again.

We would now like to stem the words (ideally we would like to lemmatize the words, but it looks like lemmatization for non-English languages is not as readily available at the moment).

**NOTE**: The stemmer casts to lowercase

In [None]:
# One Snowball stemmer per language
russian_stemmer, english_stemmer = (
    nltk.stem.SnowballStemmer(stemmer_language)
    for stemmer_language in ('russian', 'english')
)

In [None]:
def separate_cyrillic_latin(words):
    """
    Separates the cyrillic and latin words

    A word counts as cyrillic if it contains at least one cyrillic letter,
    all other words (including pure numbers) count as latin.

    Notes
    -----
    This function does not conserve word order

    Parameters
    -----------
    words : str
        The string of words to be split

    Returns
    -------
    separated_words : str
        The words separated by _SEP_
        Cyrillic words are to the left of the separator, the latin to the right
    """

    cyrillic_words = list()
    latin_words = list()

    # NOTE: split() without arguments drops the empty tokens that leading,
    #       trailing or repeated whitespace would otherwise produce (these
    #       would end up as empty "latin words")
    for word in words.split():
        # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
        # NOTE: ё and Ё are not part of the а-я and А-Я ranges
        if re.search('[а-яА-ЯёЁ]', word) is not None:
            cyrillic_words.append(word)
        else:
            latin_words.append(word)

    cyrillic_words = ' '.join(cyrillic_words)
    latin_words = ' '.join(latin_words)

    separated_words = f'{cyrillic_words}_SEP_{latin_words}'

    return separated_words

In [None]:
def _stem_words(words, stemmer):
    """Returns `words` where every whitespace separated word is stemmed with `stemmer`"""
    return ' '.join(stemmer.stem(word) for word in words.split())


def get_text_features(df, col, return_all=False):
    """
    Returns a new DataFrame with added text features

    Parameters
    -----------
    df : DataFrame
        The data frame to add the text features to
    col : str
        The column to obtain the text features from
    return_all : bool
        If True, intermediate columns will be returned

    Returns
    -------
    df_nlp : DataFrame
        The data frame with the added text features
        * {col}_clean - col column cleaned so that only alphabetical and numerical characters are present
                        (only returned if return_all is True)
        * cyrillic_latin - column where cyrillic and latin words have been separated
                           (only returned if return_all is True)
        * cyrillic - column with only stemmed cyrillic words present (only returned if return_all is True)
        * latin - column with only stemmed latin words present (only returned if return_all is True)
        * {col}_nlp - combination of the cyrillic and latin column described above
        * {col}_cyrillic_words - cyrillic word count
        * {col}_latin_words - latin word count
        * {col}_total_words - total word count
    """

    df_nlp = df.copy()

    # First we clean the text by replacing everything but letters, digits and spaces with a space
    # NOTE: ё and Ё are not part of the а-я and А-Я ranges, so they are listed
    #       explicitly (otherwise words containing them would be split in two)
    df_nlp.loc[:, f'{col}_clean'] = \
        df_nlp.loc[:, f'{col}'].apply(lambda s: re.sub('[^а-яА-ЯёЁa-zA-Z0-9 ]', ' ', s))

    # Remove duplicated whitespaces
    # NOTE: The cleaning can leave leading and trailing spaces, these are
    #       stripped as they would otherwise be counted as words
    df_nlp.loc[:, f'{col}_clean'] = \
        df_nlp.loc[:, f'{col}_clean'].apply(lambda s: re.sub(' +', ' ', s).strip())

    df_nlp.loc[:, 'cyrillic_latin'] = df_nlp.loc[:, f'{col}_clean'].apply(separate_cyrillic_latin)
    df_nlp.loc[:, 'cyrillic'] = df_nlp.loc[:, 'cyrillic_latin'].apply(lambda s: s.split('_SEP_')[0])
    df_nlp.loc[:, 'latin'] = df_nlp.loc[:, 'cyrillic_latin'].apply(lambda s: s.split('_SEP_')[1])

    # NOTE: The stemmers work on single words, so they are applied word by
    #       word rather than on the whole string at once
    df_nlp.loc[:, 'cyrillic'] = \
        df_nlp.loc[:, 'cyrillic'].apply(lambda s: _stem_words(s, russian_stemmer))
    df_nlp.loc[:, 'latin'] = \
        df_nlp.loc[:, 'latin'].apply(lambda s: _stem_words(s, english_stemmer))

    # Recombine words (the strip removes the separating space if one of the sides is empty)
    df_nlp.loc[:, f'{col}_nlp'] = \
        (df_nlp.loc[:, 'cyrillic'] + ' ' + df_nlp.loc[:, 'latin']).str.strip()

    # We add the word count of each type together with the total.
    # The rationale for doing so is
    # 1. It's possible that product with complex names are not sold as much
    # 2. In case there is a lot of English words in the product, it could be that it's less sellable in Russia
    # 3. Possible other reasons not mentioned here

    # NOTE: split() without arguments returns an empty list for an empty string
    df_nlp.loc[:, f'{col}_cyrillic_words'] = \
        df_nlp.loc[:, 'cyrillic'].apply(lambda s: len(s.split()))
    df_nlp.loc[:, f'{col}_latin_words'] = \
        df_nlp.loc[:, 'latin'].apply(lambda s: len(s.split()))

    # NOTE: This is in fact an interaction feature
    df_nlp.loc[:, f'{col}_total_words'] = \
        df_nlp.loc[:, f'{col}_cyrillic_words'] + df_nlp.loc[:, f'{col}_latin_words']

    if not return_all:
        remove = [f'{col}_clean', 'cyrillic_latin', 'cyrillic', 'latin']
        df_nlp.drop(remove, axis=1, inplace=True)

    return df_nlp

In [None]:
# Generate the text features of the three name columns
# NOTE: return_all is left at its default, so the intermediate columns are dropped
item_nlp = get_text_features(items, 'item_name')
item_category_nlp = get_text_features(item_categories, 'item_category_name')
shop_nlp = get_text_features(shops, 'shop_name')

Check how many tokens we are dealing with

In [None]:
# Join all item names to one corpus and count the distinct tokens
item_corpus = ' '.join(item_nlp['item_name_nlp'])
item_corpus_tokens = nltk.word_tokenize(item_corpus)
n_unique_item_tokens = len(set(item_corpus_tokens))
print(f'Unique item_name_tokens {n_unique_item_tokens}')

In [None]:
# Join all item category names to one corpus and count the distinct tokens
item_category_corpus = ' '.join(item_category_nlp['item_category_name_nlp'])
item_category_corpus_tokens = nltk.word_tokenize(item_category_corpus)
n_unique_item_category_tokens = len(set(item_category_corpus_tokens))
print(f'Unique item_category_name_tokens {n_unique_item_category_tokens}')

In [None]:
# Join all shop names to one corpus and count the distinct tokens
shop_corpus = ' '.join(shop_nlp['shop_name_nlp'])
shop_corpus_tokens = nltk.word_tokenize(shop_corpus)
n_unique_shop_tokens = len(set(shop_corpus_tokens))
print(f'Unique shop_name_tokens {n_unique_shop_tokens}')

We should take care not to use all tokens as this may result in a [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality). 
Let's see how the words are distributed

In [None]:
# Number of tokens passed to the frequency distribution plots below
samples = 30

In [None]:
# Token frequencies of the item corpus
fd_item = nltk.FreqDist(item_corpus_tokens)
plt.figure()
fd_item.plot(samples, cumulative=False)
plt.tight_layout()

In [None]:
# Token frequencies of the item category corpus
fd_item_category = nltk.FreqDist(item_category_corpus_tokens)
plt.figure()
fd_item_category.plot(samples, cumulative=False)
plt.tight_layout()

In [None]:
# Token frequencies of the shop corpus
fd_shop = nltk.FreqDist(shop_corpus_tokens)
plt.figure()
fd_shop.plot(samples, cumulative=False)
plt.tight_layout()

We can see that a couple of words constitute most of each corpus. In other words, we can expect a high information gain from the first couple of features and diminishing returns as we add more words. Based on graphical inspection, we will try max features $35$ for TF-IDF for the item corpus, $25$ for the item category corpus and $10$ for the shop corpus. 

In [None]:
# Maximum number of TF-IDF features per corpus (used as max_features of the vectorizers)
item_features = 35
item_category_features = 25
shop_features = 10

In [None]:
# Fit the TF-IDF model to the item names and keep the result as a dense array
tf_idf_item_vec = sklearn.feature_extraction.text.TfidfVectorizer(max_features=item_features)
tf_idf_item_sparse = tf_idf_item_vec.fit_transform(item_nlp['item_name_nlp'])
tf_idf_item = tf_idf_item_sparse.toarray()

In [None]:
# Fit the TF-IDF model to the item category names and keep the result as a dense array
tf_idf_item_category_vec = sklearn.feature_extraction.text.TfidfVectorizer(max_features=item_category_features)
tf_idf_item_category_sparse = tf_idf_item_category_vec.fit_transform(item_category_nlp['item_category_name_nlp'])
tf_idf_item_category = tf_idf_item_category_sparse.toarray()

In [None]:
# Fit the TF-IDF model to the shop names and keep the result as a dense array
tf_idf_shop_vec = sklearn.feature_extraction.text.TfidfVectorizer(max_features=shop_features)
tf_idf_shop_sparse = tf_idf_shop_vec.fit_transform(shop_nlp['shop_name_nlp'])
tf_idf_shop = tf_idf_shop_sparse.toarray()

Combine the TF-IDF results with the corresponding data frames

In [None]:
# One numbered column per TF-IDF feature, appended column-wise to the item frame
col_names = [f'item_tf_idf_{feature_nr}' for feature_nr in range(tf_idf_item.shape[1])]
tf_idf_item_df = pd.DataFrame(data=tf_idf_item, columns=col_names)
item_nlp = pd.concat([item_nlp, tf_idf_item_df], axis=1)
item_nlp = item_nlp.drop(columns=['item_name', 'item_category_id', 'item_name_nlp'])

In [None]:
# One numbered column per TF-IDF feature, appended column-wise to the item category frame
col_names = [f'item_category_tf_idf_{feature_nr}' for feature_nr in range(tf_idf_item_category.shape[1])]
tf_idf_item_category_df = pd.DataFrame(data=tf_idf_item_category, columns=col_names)
item_category_nlp = pd.concat([item_category_nlp, tf_idf_item_category_df], axis=1)
item_category_nlp = item_category_nlp.drop(columns=['item_category_name', 'item_category_name_nlp'])

In [None]:
# One numbered column per TF-IDF feature, appended column-wise to the shop frame
col_names = [f'shop_tf_idf_{feature_nr}' for feature_nr in range(tf_idf_shop.shape[1])]
tf_idf_shop_df = pd.DataFrame(data=tf_idf_shop, columns=col_names)
shop_nlp = pd.concat([shop_nlp, tf_idf_shop_df], axis=1)
shop_nlp = shop_nlp.drop(columns=['shop_name', 'shop_name_nlp'])

# Type casting?

Only makes sense for tree methods

# Feature dropping

Let's now drop the features which are no longer needed.
We will only keep those which are needed for the training.

**NOTE**: Although tempting we should not get rid of `item_id` and `shop_id` even though we got `ID`, as these are used to identify the objects under investigation. Also note that these features are not ordinal, and it is possible that some very clever label encoding for these exists.

In [None]:
# Columns which are not used for the training
drop_cols = [
    'date',
    'item_category_id',
    'item_category_name',
    'item_cnt_day',
    'item_name',
    'item_price',
    'revenue',
    'shop_name',
]
merged_train.drop(columns=drop_cols, inplace=True)

In [None]:
# Only the columns which are present in the test frame can be dropped from it
drop_cols = [drop_col for drop_col in drop_cols if drop_col in merged_test.columns]
merged_test.drop(columns=drop_cols, inplace=True)

**NOTE**: There is no need to [normalize the target](https://stats.stackexchange.com/questions/111467/is-it-necessary-to-scale-the-target-value-in-addition-to-scaling-features-for-re), however, we will use the target as a temporal feature (see below), which means that the temporal feature needs to be normalized.

# Merging

In [None]:
# Append the row numbers as a new column
# NOTE: A concatenation along axis=1 aligns the frames on their index
merged_train = pd.concat([merged_train, row_train], axis=1)
merged_test = pd.concat([merged_test, row_test], axis=1)

We see that most item ids are present only a couple of times. This means that we have little amount of item level information for most items.

In [None]:
# Attach the id counts to both frames, the left joins keep every original row
merged_train = (
    merged_train
    .merge(item_id_both, how='left', on=['item_id'])
    .merge(shop_id_both, how='left', on=['shop_id'])
)

merged_test = (
    merged_test
    .merge(item_id_both, how='left', on=['item_id'])
    .merge(shop_id_both, how='left', on=['shop_id'])
)

Merge with the train set and the test set

In [None]:
# Attach the text features to the train frame
merged_train = (
    merged_train
    .merge(item_nlp, how='left', on=['item_id'])
    .merge(item_category_nlp, how='left', on=['item_category_id'])
    .merge(shop_nlp, how='left', on=['shop_id'])
)

In [None]:
# Attach the text features to the test frame
merged_test = (
    merged_test
    .merge(item_nlp, how='left', on=['item_id'])
    .merge(item_category_nlp, how='left', on=['item_category_id'])
    .merge(shop_nlp, how='left', on=['shop_id'])
)

In [None]:
# Store the target together with its identifying columns in a frame of its own
target = merged_train[['ID', 'item_id', 'shop_id', 'date_block_num', 'target']].copy()
# The target stays in the training frame under the name `target_month`
merged_train['target_month'] = merged_train['target'].copy()
merged_train = merged_train.drop(columns='target')

**NOTE**: Left-joining the data frames with the sales on the left conserves the training rows

In [None]:
# Add the item, item category and shop information to every training row
merged_train = (
    sales_train
    .merge(items, how='left', on=['item_id'])
    .merge(item_categories, how='left', on=['item_category_id'])
    .merge(shops, how='left', on=['shop_id'])
)

**NOTE**: As we are making several `group_by` statements when generating the features, we should be careful and not combine the train and test set prior to the generation. Instead we should merge the generated features when possible, if not we should separately generate these features for the test set.

In [None]:
# Add the item, item category and shop information to every test row
merged_test = (
    sales_test
    .merge(items, how='left', on=['item_id'])
    .merge(item_categories, how='left', on=['item_category_id'])
    .merge(shops, how='left', on=['shop_id'])
)

In [None]:
# Display the first rows of the merged training frame
merged_train.head()

In [None]:
# Display the first rows of the merged test frame
merged_test.head()

Note that we already have the following encoding:

- `shop_id` - `shop_name`
- `item_id` - `item_name`
- `item_category_id` - `item_category_name`

Merge `shop_n_products` to `merged_test`

In [None]:
# One row per distinct combination of the three columns, so that the merge does not multiply the test rows
shop_products = merged_train[['shop_n_products', 'shop_id', 'item_id']].drop_duplicates()
merged_test = merged_test.merge(shop_products, how='left', on=['shop_id', 'item_id'])

In case a shop-item combination doesn't exist in the train set, but does exist in the test set, we will set it to $-1$ as we do not know its value

In [None]:
# NOTE: The column is re-assigned as an inplace `fillna` on a `.loc`
#       selection may act on a temporary object and leave the NaNs in
#       `merged_test`
merged_test['shop_n_products'] = merged_test['shop_n_products'].fillna(-1)

# Normalization

We recall that tree-based models do not depend on the normalization, but non-tree-based models hugely depend on it. As we plan to use ensemble methods for the predictions we should normalize our data. Let's go through the normalization strategy for each of the features:

We will use MinMaxScaler on these ordinal features
* `ID`
* `date_block_num`
* `item_id` 
* `month`
* `quarter`
* `row_nr`
* `shop_id`
* `year`

We will use StandardScaler on these numerical features in order to keep the distribution
* `days_in_month`
* `holiday_count`
* `item_category_name_cyrillic_words`
* `item_category_name_latin_words`
* `item_category_name_total_words`
* `item_id_count`
* `item_id_mean_enc_target_month`
* `item_count_high_month`
* `item_count_low_month`
* `item_count_mean_month`
* `item_count_sum_month`
* `item_name_cyrillic_words`
* `item_name_latin_words`
* `item_name_total_words`
* `item_price`
* `next_holiday_count`
* `prev_holiday_count`
* `revenue`
* `shop_id_count`
* `shop_item_cnt_month`
* `shop_item_count_high_month`
* `shop_item_count_low_month`
* `shop_item_count_mean_month`
* `shop_n_products`
* `shop_name_cyrillic_words`
* `shop_name_latin_words`
* `shop_name_total_words`
* `shop_revenue_month`
* `target_month`

These are already normalized
* `item_category_tf_idf_*`
* `item_tf_idf_*`
* `shop_tf_idf_*`

Due to time constraints, we will not use the Rank scaler or recast the distributions (so that they become more Gaussian) with functions like `np.log`, although this could improve the quality of the predictions from, for example, neural nets.

We make copies of `ID`, `date_block_num`, `item_id` and `shop_id` so that we can operate with one scaled and one unscaled version

In [None]:
copies = ['ID', 'date_block_num', 'item_id', 'shop_id']

for copy in copies:
    merged_train.loc[:, f'{copy}_scaled'] = merged_train.loc[:, f'{copy}'].copy()
    # NOTE: The test copy must be taken from the test frame itself. Copying
    #       from the training frame would align the training values on the
    #       test index
    merged_test.loc[:, f'{copy}_scaled'] = merged_test.loc[:, f'{copy}'].copy()

**NOTE**: We postpone normalization of `date_block_num`, `item_id` and `shop_id` as we will use these to merge temporal features on later

In [None]:
features_max_min = [
    'ID_scaled',
    'date_block_num_scaled',
    'item_id_scaled',
    'month',
    'quarter',
    'row_nr',
    'shop_id_scaled',
    'year']

max_min_scaler = sklearn.preprocessing.MinMaxScaler()

max_min_scaler.fit(merged_train.loc[:, features_max_min])
# NOTE: The columns are replaced instead of written through `.loc`, as the
#       scaled values are floats. Writing floats into integer columns in place
#       is deprecated upcasting in newer pandas versions
merged_train[features_max_min] = max_min_scaler.transform(merged_train.loc[:, features_max_min])

**NOTE**: We use the same scaler for train and test

In [None]:
# NOTE: The scaler is fitted on all of `features_max_min`, so calling
#       `transform` on a subset of the columns would fail on the number of
#       features. Each column is instead scaled with its own fitted
#       parameters (MinMaxScaler transforms X to X * scale_ + min_)
test_features_max_min = [col for col in features_max_min if col in merged_test]
fitted_positions = [features_max_min.index(col) for col in test_features_max_min]
merged_test[test_features_max_min] = (
    merged_test[test_features_max_min].to_numpy(dtype=float)
    * max_min_scaler.scale_[fitted_positions]
    + max_min_scaler.min_[fitted_positions]
)

In [None]:
features_standard_scaler = [
    'days_in_month',
    'holiday_count',
    'item_category_name_cyrillic_words',
    'item_category_name_latin_words',
    'item_category_name_total_words',
    'item_id_count',
    'item_id_mean_enc_target_month',
    'item_count_high_month',
    'item_count_low_month',
    'item_count_mean_month',
    'item_count_sum_month',
    'item_name_cyrillic_words',
    'item_name_latin_words',
    'item_name_total_words',
    'next_holiday_count',
    'prev_holiday_count',
    'shop_id_count',
    'shop_item_cnt_month',
    'shop_item_count_high_month',
    'shop_item_count_low_month',
    'shop_item_count_mean_month',
    'shop_n_products',
    'shop_name_cyrillic_words',
    'shop_name_latin_words',
    'shop_name_total_words',
    'shop_revenue_month',
    'target_month'
    ]

standard_scaler = sklearn.preprocessing.StandardScaler()

# NOTE: Lists are used instead of sets. They keep the column order
#       deterministic, and newer pandas versions do not accept a set as indexer
test_features_standard_scaler = \
    [col for col in features_standard_scaler if col in merged_test]
train_features_standard_scaler = \
    [col for col in features_standard_scaler if col not in test_features_standard_scaler]

# Exclusive train features
# NOTE: Skipped if there are none, as the scaler cannot be fitted to zero columns
if train_features_standard_scaler:
    standard_scaler.fit(merged_train.loc[:, train_features_standard_scaler])
    merged_train[train_features_standard_scaler] = \
        standard_scaler.transform(merged_train.loc[:, train_features_standard_scaler])

# Train and test features
# NOTE: This fit is done last, so that `standard_scaler` afterwards holds the
#       parameters of the features shared with the test set
standard_scaler.fit(merged_train.loc[:, test_features_standard_scaler])
merged_train[test_features_standard_scaler] = \
    standard_scaler.transform(merged_train.loc[:, test_features_standard_scaler])

In [None]:
# Scale the test features with the scaler fitted on the training set
# NOTE: The columns are replaced instead of written through `.loc`, as the
#       scaled values are floats. Writing floats into integer columns in place
#       is deprecated upcasting in newer pandas versions
merged_test[test_features_standard_scaler] = \
    standard_scaler.transform(merged_test.loc[:, test_features_standard_scaler])

# Checking correlation

Out of curiosity we check how the correlation map looks

In [None]:
# Draw the correlation matrix of the training frame as an image with a colorbar
corr = merged_train.corr()
fig, ax = plt.subplots()
cax = ax.matshow(corr)
fig.colorbar(cax)

## Fill in the blanks

We note that the merging process has created a lot of NaNs

In [None]:
# Report the percentage of the training rows where `check_col` is NaN
check_col = 'shop_revenue_month_lag_1'
isnull = merged_train.loc[:, f'{check_col}'].isnull()

# The sum of the boolean mask is the number of rows with NaN
isnull_pct = 100*isnull.sum()/merged_train.shape[0]

print(f'{isnull_pct:.2f} % of the rows of {check_col} in the training set contain NaNs')

Although there may exist smarter ways to fill the NaN values (with the mean, median or a reconstructed values), we will simply fill them with $0$.

In [None]:
# Replace every remaining NaN with 0 in both frames
merged_train = merged_train.fillna(0)
merged_test = merged_test.fillna(0)

**NOTE**: With this method we need to throw away `n_lag` months of our training set

# Storing the data frames

Guard

In [None]:
# The train and test frames must have exactly the same columns
diff = set(merged_train.columns).symmetric_difference(set(merged_test.columns))

if len(diff) != 0:
    raise AssertionError(f'Difference in columns of merged_train and merged_test found:\n{diff}')

# The feature generation must not have changed the number of rows
if n_train_samples != merged_train.shape[0]:
    raise AssertionError(f'Train samples introduced:\nn_train_samples={n_train_samples}, '
                         f'merged_train.shape[0]={merged_train.shape[0]}')

if n_test_samples != merged_test.shape[0]:
    raise AssertionError(f'Test samples introduced:\nn_test_samples={n_test_samples}, '
                         f'merged_test.shape[0]={merged_test.shape[0]}')

In [None]:
# The frames are stored in ./generated_data, which is created if it is missing
generated_data = Path('.').absolute() / 'generated_data'
generated_data.mkdir(exist_ok=True)

# Each frame is written to its own file, with the file stem as key
for stored_frame, stored_name in ((merged_train, 'train'), (target, 'target'), (merged_test, 'test')):
    stored_frame.to_hdf(generated_data / f'{stored_name}.hdf', key=stored_name)