# Feature engineering

## Initialisation

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import itertools
import time
import gc
from tqdm import notebook
from time import sleep

from pathlib import Path
import sys
ROOT = Path().resolve().parent
sys.path.append(str(ROOT))

from src.config import Config

In [None]:
# Project configuration object; config.get(key) supplies the location that is
# passed to the pandas readers below.
config = Config()

# Main dataset
sales = pd.read_parquet(config.get('cleaned_parquet'))

# Data dictionaries (lookup tables for items, item categories and shops)
items = pd.read_csv(config.get('items'))
items_categories = pd.read_csv(config.get('item_categories'))
shops = pd.read_csv(config.get('shops'))

# Folder - Submission data
submission = pd.read_csv(config.get('submission'))
# a sample submission file in the correct format.

test = pd.read_csv(config.get('test')) 
# the test set. You need to forecast the sales 
# for these shops and products for November 2015.


In [8]:
# Preview the first rows of the daily sales frame loaded above.
sales.head(5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,was_item_price_outlier,was_item_cnt_day_outlier
0,2013-01-01,0,2,991,99.0,1.0,0,0
1,2013-01-01,0,2,1472,2599.0,1.0,0,0
2,2013-01-01,0,2,1905,249.0,1.0,0,0
3,2013-01-01,0,2,2920,599.0,2.0,0,0
4,2013-01-01,0,2,3320,1999.0,1.0,0,0


## full_df - full schema of shops with dicts and target columns adding 

### Creating the full schema of monthly sold items for every shop - { df }

In [None]:
# Build the full (month, shop, item) grid: for every month, the Cartesian
# product of the shops and the items that had at least one sale in that month.
all_obs_combination_by = ['date_block_num', 'shop_id', 'item_id']
all_shops_items = []

for block_num in sales['date_block_num'].unique():
    # Filter the month once and reuse the slice for both unique() calls.
    month_sales = sales.loc[sales['date_block_num'] == block_num]
    unique_shops = month_sales['shop_id'].unique()
    unique_items = month_sales['item_id'].unique()

    # Vectorised Cartesian product in the same row order as
    # itertools.product([block_num], unique_shops, unique_items):
    # shops vary slowest, items vary fastest. This avoids materialising
    # millions of Python tuples before converting to an array.
    all_shops_items.append(np.column_stack((
        np.full(len(unique_shops) * len(unique_items), block_num),
        np.repeat(unique_shops, len(unique_items)),
        np.tile(unique_items, len(unique_shops)),
    )).astype('int32'))

df = pd.DataFrame(np.vstack(all_shops_items), columns=all_obs_combination_by, dtype='int32')
df # full schema with all unique combinations of month number, shop_id, and item_id for month

Unnamed: 0,date_block_num,shop_id,item_id
0,0,2,991
1,0,2,1472
2,0,2,1905
3,0,2,2920
4,0,2,3320
...,...,...,...
10812856,33,36,14199
10812857,33,36,6464
10812858,33,36,3327
10812859,33,36,19370


### Making a target feature and outliers flags from basic dataframe - { aggregated }

In [None]:
# Monthly aggregates per (date_block_num, shop_id, item_id):
#   item_price               -> mean daily price
#   target                   -> summed item_cnt_day (the value to forecast)
#   was_*_outlier            -> share of the daily rows flagged as outliers
aggregated = (
    sales
    .groupby(all_obs_combination_by)
    .agg(
        item_price=('item_price', 'mean'),
        target=('item_cnt_day', 'sum'),
        was_item_price_outlier=('was_item_price_outlier', 'mean'),
        was_item_cnt_day_outlier=('was_item_cnt_day_outlier', 'mean'),
    )
)

# Show only the combinations where at least one daily count was flagged as an outlier.
aggregated.loc[aggregated['was_item_cnt_day_outlier'] != 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier
date_block_num,shop_id,item_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4,2972,599.000000,10.0,0.0,0.200000
0,4,13350,30.000000,29.0,0.0,0.272727
0,4,17717,401.875000,19.0,0.0,0.125000
0,6,2972,599.000000,24.0,0.0,0.428571
0,6,2973,2498.931818,27.0,0.0,0.181818
...,...,...,...,...,...,...
33,57,20949,5.000000,101.0,0.0,0.166667
33,58,3837,3499.000000,5.0,0.0,1.000000
33,58,20949,5.000000,65.0,0.0,0.037037
33,59,1578,5999.000000,6.0,1.0,0.500000


### Merging the full schema with the target-aggregated base dataframe, then concatenating - { full_df } + { test }

In [59]:
# Attach the monthly aggregates to the full grid; grid rows without any sale
# have no match, so their aggregate columns are filled with 0.
full_df = df.merge(aggregated, on=all_obs_combination_by, how='left').fillna(0)

# The test rows are appended as month 34 (the block right after the last
# training block); their aggregate columns stay NaN.
test['date_block_num'] = 34
full_df = pd.concat([full_df, test], ignore_index=True).drop(columns='ID')

full_df

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier
0,0,2,991,99.0,1.0,0.0,0.0
1,0,2,1472,2599.0,3.0,0.0,0.0
2,0,2,1905,249.0,3.0,0.0,0.0
3,0,2,2920,598.9,7.0,0.0,0.0
4,0,2,3320,1999.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...
11027056,34,45,18454,,,,
11027057,34,45,16188,,,,
11027058,34,45,15757,,,,
11027059,34,45,19648,,,,


In [60]:
# Share of rows in the full grid whose monthly target is exactly zero.
len(full_df.loc[full_df['target'].eq(0)]) / len(full_df)

0.8348087491308881

### Adding dicts on { full_df }

In [61]:
# Quick look at the three lookup tables before they are encoded.
# sep reproduces the blank lines that separate print calls would emit.
print(shops.head(5), items.head(1), items_categories.head(1), sep='\n\n\n')

                        shop_name  shop_id
0   !Якутск Орджоникидзе, 56 фран        0
1   !Якутск ТЦ "Центральный" фран        1
2                Адыгея ТЦ "Мега"        2
3  Балашиха ТРК "Октябрь-Киномир"        3
4        Волжский ТЦ "Волга Молл"        4


                                   item_name  item_id  item_category_id
0  ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0                40


        item_category_name  item_category_id
0  PC - Гарнитуры/Наушники                 0


In [62]:
def _general_category(category_name):
    """Return the first word of a category name, with 'Игровые' expanded to 'Игровые консоли'."""
    first_word = category_name.split()[0]
    if first_word == 'Игровые':
        return 'Игровые консоли'
    return first_word


# Derive the general category label, then store it as integer category codes
# and drop the free-text category name.
items_categories['general_item_category_name'] = (
    items_categories['item_category_name']
    .map(_general_category)
    .astype('category')
    .cat.codes
)
items_categories = items_categories.drop(columns='item_category_name')

In [63]:
def _city_from_shop_name(shop_name):
    """Return the first word of a shop name, mapping '!Якутск' to 'Якутск'."""
    first_word = shop_name.split()[0]
    if first_word == '!Якутск':
        return 'Якутск'
    return first_word


# The city is the leading word of the shop name; keep it as integer category
# codes and drop the free-text shop name.
shops['city'] = shops['shop_name'].map(_city_from_shop_name).astype('category').cat.codes
shops = shops.drop(columns='shop_name')

In [64]:
# Only item_id and item_category_id are needed from the items table.
items = items.drop(columns='item_name')

In [65]:
# Same preview as before, now showing the encoded lookup tables.
# sep reproduces the blank lines that separate print calls would emit.
print(shops.head(5), items.head(1), items_categories.head(1), sep='\n\n\n')

   shop_id  city
0        0    29
1        1    29
2        2     0
3        3     1
4        4     2


   item_id  item_category_id
0        0                40


   item_category_id  general_item_category_name
0                 0                           0


In [66]:
# Enrich the grid with category, general category and city. The order matters:
# item_category_id must be attached before the category table can be joined.
full_df = (
    full_df
    .merge(items, on='item_id', how='left')
    .merge(items_categories, on='item_category_id', how='left')
    .merge(shops, on='shop_id', how='left')
)
full_df # sales DataFrame with full schema of monthly items, aggregated by monthes

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier,item_category_id,general_item_category_name,city
0,0,2,991,99.0,1.0,0.0,0.0,67,10,0
1,0,2,1472,2599.0,3.0,0.0,0.0,23,5,0
2,0,2,1905,249.0,3.0,0.0,0.0,30,5,0
3,0,2,2920,598.9,7.0,0.0,0.0,21,5,0
4,0,2,3320,1999.0,4.0,0.0,0.0,19,5,0
...,...,...,...,...,...,...,...,...,...,...
11027056,34,45,18454,,,,,55,9,20
11027057,34,45,16188,,,,,64,10,20
11027058,34,45,15757,,,,,55,9,20
11027059,34,45,19648,,,,,40,7,20


We will add the lookup-table data to `sales` as well, for future transformations.

In [67]:
# Same enrichment for the daily sales frame, in the same join order.
sales = (
    sales
    .merge(items, on='item_id', how='left')
    .merge(items_categories, on='item_category_id', how='left')
    .merge(shops, on='shop_id', how='left')
)

In [68]:
# Deep memory footprint of the daily `sales` frame, converted from bytes to MB.
size_in_bytes = sales.memory_usage(deep=True).sum()
size_in_megabytes = size_in_bytes / (1024 ** 2)

# The label now names the frame that is actually measured (it previously said full_df).
print(f"Memory usage sales: {size_in_megabytes:.2f} MB")

Memory usage full_df: 167.55 MB


In [69]:
# Deep memory footprint of full_df; 2 ** 20 bytes per MB.
size_in_bytes = full_df.memory_usage(deep=True).sum()
size_in_megabytes = size_in_bytes / 2 ** 20

print(f"Memory usage full_df: {size_in_megabytes:.2f} MB")

Memory usage full_df: 694.07 MB


## Item_cnt_day aggregation based on other features

We aggregate the target not only for the unique combinations, but also in general per shop, item_id, category, general category, and city.

### Sum

We will create features according to our plan:

In [70]:
%%time
# all_obs_combination_by

# Monthly totals of item_cnt_day per grouping key, joined back onto full_df.
# Mapping: grouping column -> name of the resulting feature column.
monthly_total_features = {
    'item_id': 'target_by_item_id_total',
    'shop_id': 'target_by_shop_id_total',
    'item_category_id': 'target_by_category_total',
    'general_item_category_name': 'target_by_general_category_total',
    'city': 'target_by_city_total',
}

for group_key, feature_name in monthly_total_features.items():
    month_and_key = ['date_block_num', group_key]
    temp = (
        sales
        .groupby(by=month_and_key, as_index=False)[['item_cnt_day']]
        .sum()
        .rename(columns={'item_cnt_day': feature_name})
    )
    full_df = full_df.merge(temp, on=month_and_key, how='left')

CPU times: total: 8.22 s
Wall time: 8.38 s


### Mean

In [71]:
%%time
# Monthly means of the daily item_cnt_day rows per grouping key, joined back
# onto full_df. Mapping: grouping column -> name of the resulting feature column.
monthly_mean_features = {
    'item_id': 'target_by_item_id_mean',
    'shop_id': 'target_by_shop_id_mean',
    'item_category_id': 'target_by_category_mean',
    'general_item_category_name': 'target_by_general_category_mean',
    'city': 'target_by_city_mean',
}

for group_key, feature_name in monthly_mean_features.items():
    month_and_key = ['date_block_num', group_key]
    temp = (
        sales
        .groupby(by=month_and_key, as_index=False)[['item_cnt_day']]
        .mean()
        .rename(columns={'item_cnt_day': feature_name})
    )
    full_df = full_df.merge(temp, on=month_and_key, how='left')

CPU times: total: 10.1 s
Wall time: 10.1 s


These statistics give us a full view of how the target changes across different dimensions. Take one observation as an example: an item was sold in some amount in the current month at some shop, but now we also know how many units of this item were sold across all shops this month, how many items of its category were sold, and how many items were sold in this city. So even when we look at a single raw observation, all of this context is already gathered and reflected in place.

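The test rows are still marked with date_block_num == 34. Because the aggregates above are grouped by the date_block_num values that exist in the original `sales` dataframe, nothing is joined onto the test rows, so these features stay NaN for them and do not interfere with the test month.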

Let's also add "soft" aggregated data, which for every "current" month includes target data only from the past months, following the same logic.

In [72]:
# List the columns accumulated in full_df so far.
full_df.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'target',
       'was_item_price_outlier', 'was_item_cnt_day_outlier',
       'item_category_id', 'general_item_category_name', 'city',
       'target_by_item_id_total', 'target_by_shop_id_total',
       'target_by_category_total', 'target_by_general_category_total',
       'target_by_city_total', 'target_by_item_id_mean',
       'target_by_shop_id_mean', 'target_by_category_mean',
       'target_by_general_category_mean', 'target_by_city_mean'],
      dtype='object')

In [73]:
# Preview full_df including the new monthly total/mean columns.
full_df.head(5)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier,item_category_id,general_item_category_name,city,target_by_item_id_total,target_by_shop_id_total,target_by_category_total,target_by_general_category_total,target_by_city_total,target_by_item_id_mean,target_by_shop_id_mean,target_by_category_mean,target_by_general_category_mean,target_by_city_mean
0,0,2,991,99.0,1.0,0.0,0.0,67,10,0,31.0,1151.0,2199.0,11567.0,1151.0,1.068966,1.079737,1.073206,1.174792,1.079737
1,0,2,1472,2599.0,3.0,0.0,0.0,23,5,0,188.0,1151.0,4172.0,46381.0,1151.0,1.175,1.079737,1.092147,1.15479,1.079737
2,0,2,1905,249.0,3.0,0.0,0.0,30,5,0,172.0,1151.0,22005.0,46381.0,1151.0,1.102564,1.079737,1.189588,1.15479,1.079737
3,0,2,2920,598.9,7.0,0.0,0.0,21,5,0,146.0,1151.0,3508.0,46381.0,1151.0,1.15873,1.079737,1.084389,1.15479,1.079737
4,0,2,3320,1999.0,4.0,0.0,0.0,19,5,0,187.0,1151.0,8980.0,46381.0,1151.0,1.206452,1.079737,1.137285,1.15479,1.079737


## First month sold items - { first_month_item_id } 

In [None]:
# Initialise the flag to 0 for every row; it is switched to 1 for the rows of
# date_block_num == 0 in a later cell of this section.
full_df['not_full_historical_data'] = 0

We would like to see how many items were sold for the first time in each month:

In [75]:
# Earliest date_block_num in which each item_id appears in full_df
# (the appended test rows of block 34 are included).
first_month = full_df.groupby('item_id', as_index=False)['date_block_num'].agg('min')

# Number of items per first-appearance month; inspection only, nothing is modified here.
first_month['date_block_num'].value_counts().sort_index()

date_block_num
0     8110
1     1239
2      799
3      628
4      682
5      531
6      447
7      469
8      384
9      654
10     489
11     536
12     207
13     339
14     310
15     319
16     262
17     253
18     316
19     250
20     330
21     558
22     460
23     472
24     197
25     225
26     336
27     262
28     225
29     250
30     206
31     282
32     302
33     475
34     363
Name: count, dtype: int64

As we can see, by far the most items were first sold in the first month (block 0), so it is better to assign a positive "not_full_historical_data" flag to these observations, because the information we want to extract with this feature is about item-count and price outliers.

In [76]:
# Earliest block in which each item_id appears, under a dedicated column name.
first_month = (
    full_df
    .groupby('item_id', as_index=False)['date_block_num']
    .min()
    .rename(columns={'date_block_num': 'first_month_item_id_num'})
)

# first_month_item_id = 1 on the rows of the block where the item first appears.
full_df = full_df.merge(first_month, on='item_id', how='left')
full_df['first_month_item_id'] = (
    full_df['date_block_num'].eq(full_df['first_month_item_id_num']).astype('int8')
)
full_df = full_df.drop(columns='first_month_item_id_num')

# Every row of block 0 is flagged as lacking history.
full_df['not_full_historical_data'] = (
    full_df['not_full_historical_data']
    .mask(full_df['date_block_num'].eq(0), 1)
    .astype(np.int8)
)

In [77]:
# Per month: rows where the item appears for the first time AND every daily
# count in that month was flagged as an outlier (aggregated share == 1).
first_month_and_outlier = (
    full_df['first_month_item_id'].eq(1) & full_df['was_item_cnt_day_outlier'].eq(1)
)
full_df.loc[first_month_and_outlier, 'date_block_num'].value_counts().sort_index()

date_block_num
0      7
1      2
2      5
3      9
4      3
5      3
7      3
8      2
9     37
10    52
12     1
13    29
14     6
16    30
17     5
18    27
19     3
20     8
21    10
22     3
23     2
24     3
26     4
27     3
28     2
29     5
32    11
33     7
Name: count, dtype: int64

We found a clear split: some items were sold for the first time in a given month and were "target" (item_cnt_day) outliers as well. This holds despite the fact that 'was_item_price_outlier' in the full_df DataFrame is an aggregated feature whose value is only rarely equal to 1.0.

In [78]:
# full_df[full_df['date_block_num'] == 0]

## Expanding window target aggregation

['shop_id','item_id], ['shop_id], ['item_id']

In [79]:
# Preview full_df with the not_full_historical_data / first_month_item_id flags.
full_df.head(3)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier,item_category_id,general_item_category_name,city,...,target_by_category_total,target_by_general_category_total,target_by_city_total,target_by_item_id_mean,target_by_shop_id_mean,target_by_category_mean,target_by_general_category_mean,target_by_city_mean,not_full_historical_data,first_month_item_id
0,0,2,991,99.0,1.0,0.0,0.0,67,10,0,...,2199.0,11567.0,1151.0,1.068966,1.079737,1.073206,1.174792,1.079737,1,1
1,0,2,1472,2599.0,3.0,0.0,0.0,23,5,0,...,4172.0,46381.0,1151.0,1.175,1.079737,1.092147,1.15479,1.079737,1,1
2,0,2,1905,249.0,3.0,0.0,0.0,30,5,0,...,22005.0,46381.0,1151.0,1.102564,1.079737,1.189588,1.15479,1.079737,1,1


In [None]:
# Key sets for the expanding-window (previous months only) target statistics.
aggregating_target_by = [['item_id', 'shop_id'], ['item_id'], ['shop_id']]

In [81]:
%%time

# Expanding-window target statistics: for every month d and every key set in
# aggregating_target_by, the mean and max of `target` over all strictly
# earlier months (date_block_num < d). Rows whose key has no history keep NaN.
for feature in aggregating_target_by:
    col = '_'.join(['target_aggregated_mean_premonthes', *feature])
    col2 = '_'.join(['target_aggregated_max_premonthes', *feature])
    full_df[col] = np.nan
    full_df[col2] = np.nan

    for d in notebook.tqdm(full_df.date_block_num.unique()):
        valid_month = (full_df.date_block_num < d)
        current_month = (full_df.date_block_num == d)

        # No earlier month exists (first block): the columns are already NaN.
        if not valid_month.any():
            continue

        # One groupby pass yields both statistics instead of grouping the
        # same history twice.
        history = (
            full_df.loc[valid_month]
            .groupby(feature)['target']
            .agg(['mean', 'max'])
            .reset_index()
        )

        # A left merge keeps the order and number of the current-month rows
        # (the keys in `history` are unique), so the merged values can be
        # written back positionally.
        current = full_df.loc[current_month, feature].merge(history, on=feature, how='left')
        full_df.loc[current_month, col] = current['mean'].to_numpy()
        full_df.loc[current_month, col2] = current['max'].to_numpy()

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

CPU times: total: 3min 48s
Wall time: 3min 40s


Since this operation takes several minutes to run (about 4 minutes in the run above), I will save the result to make it easier to debug future steps:

In [None]:
# Checkpoint: persist full_df so the slow expanding-window step does not have to be re-run.
full_df.to_parquet(config.get('interim_parquet'), engine='pyarrow')

## Data Checkpoint

In [None]:
# Reload full_df from the checkpoint written above.
full_df = pd.read_parquet(config.get('interim_parquet'), engine='pyarrow')

### Do we have NaNs after aggregating the target over previous months?

In [4]:
# Column listing after the checkpoint; the row filter on month 1 does not change the columns.
full_df[full_df['date_block_num'] == 1].columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'target',
       'was_item_price_outlier', 'was_item_cnt_day_outlier',
       'item_category_id', 'general_item_category_name', 'city',
       'target_by_item_id_total', 'target_by_shop_id_total',
       'target_by_category_total', 'target_by_general_category_total',
       'target_by_city_total', 'target_by_item_id_mean',
       'target_by_shop_id_mean', 'target_by_category_mean',
       'target_by_general_category_mean', 'target_by_city_mean',
       'not_full_historical_data', 'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id'],
      dtype='object')

Are there any missing target_aggregated_mean_premonthes_item_id_shop_id values (no data from previous months)?

In [5]:
# Per month (excluding block 0): rows with no (item_id, shop_id) history.
pair_history_missing = full_df['target_aggregated_mean_premonthes_item_id_shop_id'].isnull()
after_first_block = full_df['date_block_num'] > 0

full_df.loc[pair_history_missing & after_first_block, ['date_block_num']].value_counts().sort_index()

date_block_num
1                 63921
2                 37215
3                 28407
4                 38416
5                 33350
6                 21720
7                 21644
8                 17569
9                 37652
10                22141
11                32757
12                10420
13                16013
14                21757
15                23019
16                14413
17                13260
18                22415
19                20021
20                17733
21                37871
22                23263
23                23852
24                10013
25                10896
26                15819
27                11920
28                10044
29                10846
30                 8976
31                11903
32                12727
33                30402
34                20833
Name: count, dtype: int64

Let's look again, excluding the months in which an item was sold for the first time (those rows have no data to aggregate from):

In [6]:
# Same count as above, but excluding rows where the item itself is new that month.
pair_history_missing = full_df['target_aggregated_mean_premonthes_item_id_shop_id'].isnull()
after_first_block = full_df['date_block_num'] > 0
item_not_new = full_df['first_month_item_id'] != 1

full_df.loc[
    pair_history_missing & after_first_block & item_not_new, ['date_block_num']
].value_counts().sort_index()

date_block_num
1                 6927
2                  461
3                  147
4                 7726
5                 8924
6                 1158
7                  539
8                  289
9                 7568
10                 136
11                8101
12                 898
13                 419
14                7187
15                7707
16                1837
17                1116
18                6931
19                7521
20                1563
21                9413
22                 723
23                 724
24                 360
25                 321
26                 363
27                 130
28                 144
29                  96
30                 118
31                  59
32                  43
33                9502
34                5587
Name: count, dtype: int64

In [7]:
# # In case we can add flag for lack of info for target_aggregated_mean_premonthes_item_id_shop_id with the following code 
# full_df[(full_df['target_aggregated_mean_premonthes_item_id_shop_id'].isnull())\
#          & (full_df['date_block_num']>0) & (full_df['first_month_item_id']!= 1 )]['not_full_historical_data'] = 1

And now let's see whether there are any shops that have no historical aggregated data:

In [8]:
# Per month (excluding block 0 and first-month items): rows whose shop has no
# aggregated history from previous months.

full_df[(full_df['target_aggregated_mean_premonthes_shop_id'].isnull())\
         & (full_df['date_block_num']>0) & (full_df['first_month_item_id']!= 1 )][['date_block_num']].value_counts().sort_index() # inspect what these NaNs are

# Optional (disabled): flag these rows as lacking history.
# NOTE(review): as written this is chained indexing and would assign to a
# temporary copy; use full_df.loc[mask, 'not_full_historical_data'] = 1 if enabled.
# full_df[(full_df['target_aggregated_mean_premonthes_shop_id'].isnull())\
#          & (full_df['date_block_num']>0) & (full_df['first_month_item_id']!= 1 )]['not_full_historical_data'] = 1

date_block_num
1                 6927
4                 7655
5                 7943
9                 7384
11                7937
14                6828
15                6453
18                6290
19                6109
21                5773
33                4936
34                4737
Name: count, dtype: int64

After the target aggregation we can see NaN values in some places: when an item is sold for the first time in a month, there is no historical data for it, so nothing can be collected from previous months. That is fine, because those observations still carry the first-month-of-sale flag, so they are marked for the model. But what are the remaining NaNs, and why do they still exist after we filter out first-month items? Those are cases where the item is not being sold for the first time overall, but is being sold for the first time in this shop. Let's check this below using only the item_id-level target aggregate (was the item sold previously, so that it has historical data, or is this the first time it is sold?):

In [9]:
# Per month (excluding block 0): rows whose item_id has no history at all.
item_history_missing = full_df['target_aggregated_mean_premonthes_item_id'].isnull()
after_first_block = full_df['date_block_num'] > 0

full_df.loc[item_history_missing & after_first_block, ['date_block_num']].value_counts().sort_index()

date_block_num
1                 56994
2                 36754
3                 28260
4                 30690
5                 24426
6                 20562
7                 21105
8                 17280
9                 30084
10                22005
11                24656
12                 9522
13                15594
14                14570
15                15312
16                12576
17                12144
18                15484
19                12500
20                16170
21                28458
22                22540
23                23128
24                 9653
25                10575
26                15456
27                11790
28                 9900
29                10750
30                 8858
31                11844
32                12684
33                20900
34                15246
Name: count, dtype: int64

In [10]:
# Sanity check: item-level history should only be missing in the item's first
# month, so this selection is expected to be empty.
item_history_missing = full_df['target_aggregated_mean_premonthes_item_id'].isnull()
after_first_block = full_df['date_block_num'] > 0
item_not_new = full_df['first_month_item_id'] != 1

full_df.loc[item_history_missing & after_first_block & item_not_new, ['date_block_num']].shape

(0, 1)

In [11]:
# Preview the test-month rows (block 34).
full_df.loc[full_df['date_block_num'].eq(34)].head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier,item_category_id,general_item_category_name,city,...,target_by_general_category_mean,target_by_city_mean,not_full_historical_data,first_month_item_id,target_aggregated_mean_premonthes_item_id_shop_id,target_aggregated_max_premonthes_item_id_shop_id,target_aggregated_mean_premonthes_item_id,target_aggregated_max_premonthes_item_id,target_aggregated_mean_premonthes_shop_id,target_aggregated_max_premonthes_shop_id
10812861,34,5,5037,,,,,19,5,3,...,,,0,0,0.928571,3.0,1.958009,15.0,0.186444,111.0
10812862,34,5,5320,,,,,55,9,3,...,,,0,1,,,,,0.186444,111.0
10812863,34,5,5233,,,,,19,5,3,...,,,0,0,1.428571,3.0,1.673267,10.0,0.186444,111.0
10812864,34,5,5232,,,,,23,5,3,...,,,0,0,0.333333,1.0,1.109375,6.0,0.186444,111.0
10812865,34,5,5268,,,,,20,5,3,...,,,0,1,,,,,0.186444,111.0


### What else? - follow up on the plan

**Basic features:**

- ✅City of the shop  
- ✅Category of the product (item)  
- ✅General product category (item)  
- ✅Monthly sales of exact category in shop `target` (it will be our target) 

**We also should add some lag features based on statistical metrics for exact shops, item_id's, and categories:**

- ✅Total / Mean amount of sold exact `item_id` in this month (How many of this `item_id` we sold in this month?)  
- ✅Total / Mean amount of `item_cnt_day` sold this month in the exact shop (How many items this shop sold this month?)  
- ✅Total / Mean amount of product category sold in this month (How well this category sold during this month?)  
- ✅Total / Mean amount of general product category sold in this month (How well "Movies" category sold during this month?)

**Combined:**
- ❌Mean of how many items of exact category exact shop sell per month (How good this shop sells products of this cateogry?)


**Next:**
- Then we need to add lags for 1, 2, 3, 12 month periods for all features mentioned above  
- ✅Also, would be nice to add a feature like `not_full_historical_data` for the first three months and for monthes in the first year
- Add "deltas" for target - as how amount of sold items been changed for the last monthes of this `shop_id` and `item_id` 
- ✅Add binary feature `first_month_item_id` which reflects if exact item will be sold first time in this month
- ✅❌Add mean sliding window for target for the last three monthes `mean_3`
- Number of month as `month = date_block_num mod 12`
- Add binary feature `shop_was_in_test` and `item_id_was_in_test` for shops and items which will be in test when model will predict

Remaining:
- Number of month as `month = date_block_num mod 12`
- Add binary feature `shop_was_in_test` and `item_id_was_in_test` for shops and items which will be in test when model will predict
- Then we need to add lags for 1, 2, 3, 12 month periods for all features mentioned above  
- Add "deltas" for target - as how amount of sold items been changed for the last monthes of this `shop_id` and `item_id` 

## Year and Month Feature

In [12]:
# Calendar month 1..12; block 0 corresponds to January 2013 in the sales preview above.
full_df['month'] = ((full_df['date_block_num'] % 12) + 1).astype(np.int8)

In [13]:
# Calendar year: every 12 blocks advance the year, starting from 2013 at block 0.
full_df['year'] = (2013 + (full_df['date_block_num'] // 12))

In [14]:
# Check which years the formula produced.
full_df['year'].unique()

array([2013, 2014, 2015], dtype=int64)

## Shop_was_in_test and item_id_was_in_test

In [15]:
# Shops and items that occur in the test set.
shop_id_test = test['shop_id'].unique()
item_id_test = test['item_id'].unique()

print('Nunique shop_id in test: ', test['shop_id'].nunique())
print('Nunique item_id in test: ',test['item_id'].nunique())
print('\n')
print("Unique values of shop_id : ",np.sort(shop_id_test))
print("Unique values of item_id",np.sort(item_id_test))
print('\n')

# Flag rows of blocks other than 34 whose item / shop also occurs in the test set.
# NOTE(review): the block-34 rows are excluded by the mask, so they all get 0,
# even though those rows are expected to be the appended test rows themselves.
# The model would then see a value at prediction time that differs from
# comparable training rows — confirm this is intended.
not_test_block = full_df['date_block_num'] != 34
full_df['item_id_was_in_test'] = (
    full_df['item_id'].isin(item_id_test) & not_test_block
).astype(np.int8)
full_df['shop_id_was_in_test'] = (
    full_df['shop_id'].isin(shop_id_test) & not_test_block
).astype(np.int8)

# Checking the result
flagged_shops = full_df.loc[full_df['shop_id_was_in_test'].eq(1), 'shop_id'].unique()
print('Unique values of shop_id with assigned positive shop_id_was_in_test flag',np.sort(flagged_shops))

Nunique shop_id in test:  42
Nunique item_id in test:  5100


Unique values of shop_id :  [ 2  3  4  5  6  7 10 12 14 15 16 18 19 21 22 24 25 26 28 31 34 35 36 37
 38 39 41 42 44 45 46 47 48 49 50 52 53 55 56 57 58 59]
Unique values of item_id [   30    31    32 ... 22164 22166 22167]


Unique values of shop_id with assigned positive shop_id_was_in_test flag [ 2  3  4  5  6  7 12 14 15 16 18 19 21 22 24 25 26 28 31 34 35 36 37 38
 39 41 42 44 45 46 47 48 49 50 52 53 55 56 57 58 59]


In [16]:
# Deep memory footprint and shape of full_df after the calendar and test flags; 2 ** 20 bytes per MB.
size_in_bytes = full_df.memory_usage(deep=True).sum()
size_in_megabytes = size_in_bytes / 2 ** 20

for report_line in (
    f"Memory usage full_df: {size_in_megabytes:.2f} MB",
    f'Amount of rows in this table: {full_df.shape[0]}',
    f'Amount of columns in this table: {full_df.shape[1]}',
):
    print(report_line)

Memory usage full_df: 2176.86 MB
Amount of rows in this table: 11027061
Amount of columns in this table: 32


In [17]:
# Preview one row with the newly added month / year / *_was_in_test columns.
full_df.head(1)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier,item_category_id,general_item_category_name,city,...,target_aggregated_mean_premonthes_item_id_shop_id,target_aggregated_max_premonthes_item_id_shop_id,target_aggregated_mean_premonthes_item_id,target_aggregated_max_premonthes_item_id,target_aggregated_mean_premonthes_shop_id,target_aggregated_max_premonthes_shop_id,month,year,item_id_was_in_test,shop_id_was_in_test
0,0,2,991,99.0,1.0,0.0,0.0,67,10,0,...,,,,,,,1,2013,0,1


In [18]:
# Don't we have any data leakege ? 
# Leakage check: count the block-34 rows that carry a non-zero *_was_in_test flag.
test_flag_cols = ['shop_id_was_in_test', 'item_id_was_in_test']
flag_set_in_test_block = full_df['date_block_num'].eq(34) & full_df[test_flag_cols].ne(0).any(axis=1)
len(full_df.loc[flag_set_in_test_block, test_flag_cols])

0

## Downcast

In [19]:
# Inspect column dtypes before downcasting.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11027061 entries, 0 to 11027060
Data columns (total 32 columns):
 #   Column                                             Dtype  
---  ------                                             -----  
 0   date_block_num                                     int64  
 1   shop_id                                            int64  
 2   item_id                                            int64  
 3   item_price                                         float64
 4   target                                             float64
 5   was_item_price_outlier                             float64
 6   was_item_cnt_day_outlier                           float64
 7   item_category_id                                   int64  
 8   general_item_category_name                         int8   
 9   city                                               int8   
 10  target_by_item_id_total                            float64
 11  target_by_shop_id_total                         

In [20]:
# Dry run for the 64-bit -> 32-bit downcast: report what would be lost.
float_cols = full_df.select_dtypes(include='float64').columns
int_cols = full_df.select_dtypes(include='int64').columns

# Floats: largest absolute difference between a value and its float32 version.
for col in float_cols:
    as_float32 = full_df[col].astype('float32')
    max_diff = full_df[col].sub(as_float32).abs().max()
    print(f"{col}: max precision loss when downcasted to float32 = {max_diff}")

# Integers: report columns whose value range does not fit into int32.
int32_bounds = np.iinfo(np.int32)
for col in int_cols:
    column = full_df[col]
    if column.min() < int32_bounds.min or column.max() > int32_bounds.max:
        print(f"{col}: OVERFLOW when downcasted to int32 (values out of range)")



item_price: max precision loss when downcasted to float32 = 0.00023674272779317107
target: max precision loss when downcasted to float32 = 0.0
was_item_price_outlier: max precision loss when downcasted to float32 = 2.384185793236071e-08
was_item_cnt_day_outlier: max precision loss when downcasted to float32 = 2.8840957178033477e-08
target_by_item_id_total: max precision loss when downcasted to float32 = 0.0
target_by_shop_id_total: max precision loss when downcasted to float32 = 0.0
target_by_category_total: max precision loss when downcasted to float32 = 0.0
target_by_general_category_total: max precision loss when downcasted to float32 = 0.0
target_by_city_total: max precision loss when downcasted to float32 = 0.0
target_by_item_id_mean: max precision loss when downcasted to float32 = 2.3019724881834236e-07
target_by_shop_id_mean: max precision loss when downcasted to float32 = 1.0217939117040942e-07
target_by_category_mean: max precision loss when downcasted to float32 = 1.996062524

In [21]:
def downcast_dtypes(df):
    """Shrink 64-bit numeric columns of ``df`` to 32-bit, in place.

    * ``float64`` columns are cast to ``float32`` (no range or precision
      check is performed).
    * ``int64`` columns are cast to ``int32`` only when their observed
      minimum and maximum fit into the int32 range; columns that would
      overflow are left as ``int64`` instead of being silently wrapped.

    Columns of any other dtype are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    float_cols = df.select_dtypes(include=['float64']).columns
    int_cols = df.select_dtypes(include=['int64']).columns

    if len(float_cols) > 0:
        df[float_cols] = df[float_cols].astype('float32')

    if len(int_cols) > 0:
        int32_info = np.iinfo(np.int32)
        lows = df[int_cols].min()
        highs = df[int_cols].max()
        # Written as "not out of range" so that NaN extrema (zero-row frame)
        # still count as fitting, matching the unconditional cast for that case.
        out_of_range = (lows < int32_info.min) | (highs > int32_info.max)
        safe_int_cols = int_cols[(~out_of_range).to_numpy()]
        if len(safe_int_cols) > 0:
            df[safe_int_cols] = df[safe_int_cols].astype('int32')

    return df

In [22]:
# Sum the per-column memory usage and convert bytes to megabytes (1024**2 bytes each).
bytes_per_megabyte = 1024 ** 2
size_in_bytes = full_df.memory_usage(deep=True).sum()
size_in_megabytes = size_in_bytes / bytes_per_megabyte
row_count, column_count = full_df.shape

print(f"Memory usage full_df: {size_in_megabytes:.2f} MB")
print(f'Amount of rows in this table: {row_count}')
print(f'Amount of columns in this table: {column_count}')

Memory usage full_df: 2176.86 MB
Amount of rows in this table: 11027061
Amount of columns in this table: 32


In [23]:
full_df.isna().sum()

date_block_num                                             0
shop_id                                                    0
item_id                                                    0
item_price                                            214200
target                                                214200
was_item_price_outlier                                214200
was_item_cnt_day_outlier                              214200
item_category_id                                           0
general_item_category_name                                 0
city                                                       0
target_by_item_id_total                               214200
target_by_shop_id_total                               214200
target_by_category_total                              214200
target_by_general_category_total                      214200
target_by_city_total                                  214200
target_by_item_id_mean                                214200
target_by_shop_id_mean  

Target aggregations by shop, item_id, etc. are expectedly NaN for the test rows; for the other features we can fill with the value 0, since the amount of sold items in previous months is zero.

In [24]:
test.shape[0] 

214200

In [25]:
%%time
full_df.fillna(0, inplace=True)
full_df = downcast_dtypes(full_df)

CPU times: total: 5.09 s
Wall time: 5.13 s


In [26]:
# Same report as before the downcast, to compare the footprint.
bytes_per_megabyte = 1024 ** 2
size_in_bytes = full_df.memory_usage(deep=True).sum()
size_in_megabytes = size_in_bytes / bytes_per_megabyte
row_count, column_count = full_df.shape

print(f"Memory usage full_df: {size_in_megabytes:.2f} MB")
print(f'Amount of rows in this table: {row_count}')
print(f'Amount of columns in this table: {column_count}')

Memory usage full_df: 1125.24 MB
Amount of rows in this table: 11027061
Amount of columns in this table: 32


In [27]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11027061 entries, 0 to 11027060
Data columns (total 32 columns):
 #   Column                                             Dtype  
---  ------                                             -----  
 0   date_block_num                                     int32  
 1   shop_id                                            int32  
 2   item_id                                            int32  
 3   item_price                                         float32
 4   target                                             float32
 5   was_item_price_outlier                             float32
 6   was_item_cnt_day_outlier                           float32
 7   item_category_id                                   int32  
 8   general_item_category_name                         int8   
 9   city                                               int8   
 10  target_by_item_id_total                            float32
 11  target_by_shop_id_total                         

## Lag features

In [28]:
all_obs_combination_by = ['date_block_num', 'shop_id', 'item_id']

In [29]:
# Every column whose name contains 'target' is a candidate for lagging.
shifted_columns = [name for name in full_df.columns if 'target' in name]
shifted_columns

['target',
 'target_by_item_id_total',
 'target_by_shop_id_total',
 'target_by_category_total',
 'target_by_general_category_total',
 'target_by_city_total',
 'target_by_item_id_mean',
 'target_by_shop_id_mean',
 'target_by_category_mean',
 'target_by_general_category_mean',
 'target_by_city_mean',
 'target_aggregated_mean_premonthes_item_id_shop_id',
 'target_aggregated_max_premonthes_item_id_shop_id',
 'target_aggregated_mean_premonthes_item_id',
 'target_aggregated_max_premonthes_item_id',
 'target_aggregated_mean_premonthes_shop_id',
 'target_aggregated_max_premonthes_shop_id']

In [30]:
%%time
all_obs_combination_by = ['date_block_num', 'shop_id', 'item_id']

shift_range = [1, 2, 3, 12]

shifted_columns = [c for c in full_df if 'target' in c]
shifted_columns = shifted_columns + ['was_item_price_outlier', 'was_item_cnt_day_outlier', 'item_price']

for shift in shift_range:
    temp = full_df[all_obs_combination_by + shifted_columns].copy()
    temp['date_block_num'] = temp['date_block_num'] + shift

    foo = lambda x: f'{x}_lag_{shift}' if x in shifted_columns else x
    temp = temp.rename(columns=foo)

    full_df = pd.merge(full_df, temp, on = all_obs_combination_by, how= 'left').fillna(0)
    full_df = downcast_dtypes(full_df)

    del temp
    gc.collect

CPU times: total: 1min 4s
Wall time: 1min 8s


In [31]:
full_df.shape

(11027061, 112)

In [32]:
full_df.tail(5)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,target,was_item_price_outlier,was_item_cnt_day_outlier,item_category_id,general_item_category_name,city,...,target_by_city_mean_lag_12,target_aggregated_mean_premonthes_item_id_shop_id_lag_12,target_aggregated_max_premonthes_item_id_shop_id_lag_12,target_aggregated_mean_premonthes_item_id_lag_12,target_aggregated_max_premonthes_item_id_lag_12,target_aggregated_mean_premonthes_shop_id_lag_12,target_aggregated_max_premonthes_shop_id_lag_12,was_item_price_outlier_lag_12,was_item_cnt_day_outlier_lag_12,item_price_lag_12
11027056,34,45,18454,0.0,0.0,0.0,0.0,55,9,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11027057,34,45,16188,0.0,0.0,0.0,0.0,64,10,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11027058,34,45,15757,0.0,0.0,0.0,0.0,55,9,20,...,1.165751,0.227273,2.0,0.256809,3.0,0.183784,73.0,0.0,0.0,0.0
11027059,34,45,19648,0.0,0.0,0.0,0.0,40,7,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11027060,34,45,969,0.0,0.0,0.0,0.0,37,7,20,...,1.165751,1.0,3.0,1.008097,13.0,0.183784,73.0,0.0,0.0,0.0


In [33]:
np.array(full_df.columns)

array(['date_block_num', 'shop_id', 'item_id', 'item_price', 'target',
       'was_item_price_outlier', 'was_item_cnt_day_outlier',
       'item_category_id', 'general_item_category_name', 'city',
       'target_by_item_id_total', 'target_by_shop_id_total',
       'target_by_category_total', 'target_by_general_category_total',
       'target_by_city_total', 'target_by_item_id_mean',
       'target_by_shop_id_mean', 'target_by_category_mean',
       'target_by_general_category_mean', 'target_by_city_mean',
       'not_full_historical_data', 'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id', 'month', 'year',
       'item_id_was_in_test', 'shop_id_was_in_test', 'target_lag_1',
       'target_by_it

In [34]:
[c for c in full_df if 'target' in c]

['target',
 'target_by_item_id_total',
 'target_by_shop_id_total',
 'target_by_category_total',
 'target_by_general_category_total',
 'target_by_city_total',
 'target_by_item_id_mean',
 'target_by_shop_id_mean',
 'target_by_category_mean',
 'target_by_general_category_mean',
 'target_by_city_mean',
 'target_aggregated_mean_premonthes_item_id_shop_id',
 'target_aggregated_max_premonthes_item_id_shop_id',
 'target_aggregated_mean_premonthes_item_id',
 'target_aggregated_max_premonthes_item_id',
 'target_aggregated_mean_premonthes_shop_id',
 'target_aggregated_max_premonthes_shop_id',
 'target_lag_1',
 'target_by_item_id_total_lag_1',
 'target_by_shop_id_total_lag_1',
 'target_by_category_total_lag_1',
 'target_by_general_category_total_lag_1',
 'target_by_city_total_lag_1',
 'target_by_item_id_mean_lag_1',
 'target_by_shop_id_mean_lag_1',
 'target_by_category_mean_lag_1',
 'target_by_general_category_mean_lag_1',
 'target_by_city_mean_lag_1',
 'target_aggregated_mean_premonthes_item_id_s

In [35]:
# Footprint of full_df once the lag columns have been added.
bytes_per_megabyte = 1024 ** 2
size_in_bytes = full_df.memory_usage(deep=True).sum()
size_in_megabytes = size_in_bytes / bytes_per_megabyte
row_count, column_count = full_df.shape

print(f"Memory usage full_df: {size_in_megabytes:.2f} MB")
print(f'Amount of rows in this table: {row_count}')
print(f'Amount of columns in this table: {column_count}')

Memory usage full_df: 4490.43 MB
Amount of rows in this table: 11027061
Amount of columns in this table: 112


## Deltas

In [36]:
# ## Key Idea
# full_df['target_delta_1_2'] = full_df['target_lag_1'] - full_df['target_lag_2']
# full_df['target_delta_2_3'] = full_df['target_lag_2'] - full_df['target_lag_3']

# full_df['target_predict_1_2'] = full_df['target_lag_1'] + full_df['target_delta_1_2']
# full_df['target_predict_2_3'] = full_df['target_lag_1'] + full_df['target_delta_2_3'] + full_df['target_predict_1_2']

In [37]:
# Base series for which lag-to-lag differences and extrapolations are derived.
columns_to_delta = [
    'target',
    'target_by_item_id_total',
    'target_by_shop_id_total',
    'target_by_category_total',
    'target_by_general_category_total',
    'target_by_city_total',
]

for base in columns_to_delta:
    lag_1 = full_df[f'{base}_lag_1']
    lag_2 = full_df[f'{base}_lag_2']
    lag_3 = full_df[f'{base}_lag_3']

    # Change between consecutive lags.
    full_df[f'{base}_delta_1_2'] = lag_1 - lag_2
    full_df[f'{base}_delta_2_3'] = lag_2 - lag_3

    # Extrapolations built from lag 1 and the deltas above.
    full_df[f'{base}_predict_1_2'] = lag_1 + full_df[f'{base}_delta_1_2']
    full_df[f'{base}_predict_2_3'] = (
        lag_1 + full_df[f'{base}_delta_2_3'] + full_df[f'{base}_predict_1_2']
    )

In [38]:
np.array(full_df.columns)

array(['date_block_num', 'shop_id', 'item_id', 'item_price', 'target',
       'was_item_price_outlier', 'was_item_cnt_day_outlier',
       'item_category_id', 'general_item_category_name', 'city',
       'target_by_item_id_total', 'target_by_shop_id_total',
       'target_by_category_total', 'target_by_general_category_total',
       'target_by_city_total', 'target_by_item_id_mean',
       'target_by_shop_id_mean', 'target_by_category_mean',
       'target_by_general_category_mean', 'target_by_city_mean',
       'not_full_historical_data', 'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id', 'month', 'year',
       'item_id_was_in_test', 'shop_id_was_in_test', 'target_lag_1',
       'target_by_it

## Saving Final Data Frame

In [39]:
# Keep only the rows of the last month (date_block_num == 34).
temp = full_df[full_df['date_block_num'] == 34]

# Boolean mask with one entry per column of full_df: True where the column
# equals 0 on every row of that month. `.all()` expresses this directly
# instead of comparing the zero count with the hard-coded row count 214200.
leakege_true = list((temp == 0).all())

del temp
gc.collect()

0

In [40]:
# Names of the columns flagged by the boolean mask. Indexing the column labels
# directly avoids building an intermediate DataFrame over all rows just to
# read its column names.
leakege_features = set(full_df.columns[np.asarray(leakege_true, dtype=bool)])
leakege_features

{'item_id_was_in_test',
 'item_price',
 'not_full_historical_data',
 'shop_id_was_in_test',
 'target',
 'target_by_category_mean',
 'target_by_category_total',
 'target_by_city_mean',
 'target_by_city_total',
 'target_by_general_category_mean',
 'target_by_general_category_total',
 'target_by_item_id_mean',
 'target_by_item_id_total',
 'target_by_shop_id_mean',
 'target_by_shop_id_total',
 'was_item_cnt_day_outlier',
 'was_item_price_outlier'}

In [41]:
# Columns that are all-zero in the last month but must be kept anyway.
constant_features = {'target','item_id_was_in_test', 'shop_id_was_in_test','not_full_historical_data'}

# Coerce to a set first so the cell can be re-run after `leakege_features`
# has already been rebound to a list; sort for a stable, reproducible order.
leakege_features = sorted(set(leakege_features) - constant_features)

leakege_features

['target_by_item_id_mean',
 'target_by_general_category_total',
 'was_item_cnt_day_outlier',
 'target_by_category_total',
 'target_by_city_total',
 'target_by_category_mean',
 'target_by_city_mean',
 'was_item_price_outlier',
 'target_by_shop_id_total',
 'target_by_general_category_mean',
 'target_by_shop_id_mean',
 'item_price',
 'target_by_item_id_total']

In [42]:
# Remove the flagged columns (`columns=` already selects the axis), then show the frame.
full_df = full_df.drop(columns=leakege_features)
full_df

Unnamed: 0,date_block_num,shop_id,item_id,target,item_category_id,general_item_category_name,city,not_full_historical_data,first_month_item_id,target_aggregated_mean_premonthes_item_id_shop_id,...,target_by_category_total_predict_1_2,target_by_category_total_predict_2_3,target_by_general_category_total_delta_1_2,target_by_general_category_total_delta_2_3,target_by_general_category_total_predict_1_2,target_by_general_category_total_predict_2_3,target_by_city_total_delta_1_2,target_by_city_total_delta_2_3,target_by_city_total_predict_1_2,target_by_city_total_predict_2_3
0,0,2,991,1.0,67,10,0,1,1,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,1472,3.0,23,5,0,1,1,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,1905,3.0,30,5,0,1,1,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2,2920,7.0,21,5,0,1,1,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2,3320,4.0,19,5,0,1,1,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11027056,34,45,18454,0.0,55,9,20,0,0,0.818182,...,6719.0,11427.0,720.0,-862.0,7548.0,13514.0,61.0,-65.0,1520.0,2914.0
11027057,34,45,16188,0.0,64,10,20,0,0,0.000000,...,949.0,2625.0,3215.0,10414.0,16844.0,40887.0,61.0,1398.0,1520.0,4377.0
11027058,34,45,15757,0.0,55,9,20,0,0,0.205882,...,6719.0,11427.0,720.0,-862.0,7548.0,13514.0,61.0,-65.0,1520.0,2914.0
11027059,34,45,19648,0.0,40,7,20,0,0,0.000000,...,8562.0,14497.0,1674.0,-1910.0,14557.0,25530.0,61.0,-65.0,1520.0,2914.0


In [46]:
# Set both "was in test" flags to 1 on the rows of the last month (date_block_num == 34).
is_last_month = full_df['date_block_num'] == 34
full_df.loc[is_last_month, 'item_id_was_in_test'] = np.int8(1)
full_df.loc[is_last_month, 'shop_id_was_in_test'] = np.int8(1)

In [43]:
full_df.columns[:50]

Index(['date_block_num', 'shop_id', 'item_id', 'target', 'item_category_id',
       'general_item_category_name', 'city', 'not_full_historical_data',
       'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id', 'month', 'year',
       'item_id_was_in_test', 'shop_id_was_in_test', 'target_lag_1',
       'target_by_item_id_total_lag_1', 'target_by_shop_id_total_lag_1',
       'target_by_category_total_lag_1',
       'target_by_general_category_total_lag_1', 'target_by_city_total_lag_1',
       'target_by_item_id_mean_lag_1', 'target_by_shop_id_mean_lag_1',
       'target_by_category_mean_lag_1',
       'target_by_general_category_mean_lag_1', 'target_by_city_mean_lag_1',
       'target_aggregated_me

In [47]:
full_df.shape[1]

123

In [None]:
full_df.to_parquet(config.get('features'), engine='pyarrow')

In [54]:
# Month 34 is the block to predict; every other month is training data.
is_predict_month = full_df['date_block_num'] == 34
train = full_df[~is_predict_month]
predict = full_df[is_predict_month]

In [55]:
# Features: everything except 'target'. Label: 'target' kept as a one-column DataFrame.
train_y = train.loc[:, ['target']]
train_x = train.drop(columns='target')

In [56]:
# The prediction rows carry no label column.
predict = predict.drop(columns='target')

In [65]:
train_x.columns[0:20]

Index(['date_block_num', 'shop_id', 'item_id', 'item_category_id',
       'general_item_category_name', 'city', 'not_full_historical_data',
       'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id', 'month', 'year',
       'item_id_was_in_test', 'shop_id_was_in_test', 'target_lag_1',
       'target_by_item_id_total_lag_1'],
      dtype='object')

In [76]:
# Sanity check: train_x is expected to have exactly 122 columns.
if train_x.shape[1] == 122:
    print('✅✅✅Seems you have 122 columns in train_x which is a good result✅✅✅')
else:
    print("⚠️⚠️⚠️YOUR DATA_FRAME LOST SOME COLUMNS ⚠️⚠️⚠️")

✅✅✅Seems you have 122 columns in train_x which is a good result✅✅✅


In [None]:
# Write each frame to the parquet path stored in the config under its key.
for frame, config_key in (
    (train_x, 'train_x'),
    (train_y, 'train_y'),
    (predict, 'inference'),
):
    frame.to_parquet(config.get(config_key), engine='pyarrow')

In [None]:
# Read the three saved files back, in the same order they were written.
train_test_x, train_test_y, predict_test = (
    pd.read_parquet(config.get(config_key), engine='pyarrow')
    for config_key in ('train_x', 'train_y', 'inference')
)

In [77]:
train_test_x.columns[:20]

Index(['date_block_num', 'shop_id', 'item_id', 'item_category_id',
       'general_item_category_name', 'city', 'not_full_historical_data',
       'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id', 'month', 'year',
       'item_id_was_in_test', 'shop_id_was_in_test', 'target_lag_1',
       'target_by_item_id_total_lag_1'],
      dtype='object')

In [84]:
predict_test.columns[:20]

Index(['date_block_num', 'shop_id', 'item_id', 'item_category_id',
       'general_item_category_name', 'city', 'not_full_historical_data',
       'first_month_item_id',
       'target_aggregated_mean_premonthes_item_id_shop_id',
       'target_aggregated_max_premonthes_item_id_shop_id',
       'target_aggregated_mean_premonthes_item_id',
       'target_aggregated_max_premonthes_item_id',
       'target_aggregated_mean_premonthes_shop_id',
       'target_aggregated_max_premonthes_shop_id', 'month', 'year',
       'item_id_was_in_test', 'shop_id_was_in_test', 'target_lag_1',
       'target_by_item_id_total_lag_1'],
      dtype='object')

In [111]:
# For each column of predict_test: True when every value equals 0.
# `.all()` replaces the comparison of the zero count with the hard-coded
# row count 214200.
some = (predict_test == 0).all()
# Show the first 20 columns only.
print(some.iloc[:20])

date_block_num                                       False
shop_id                                              False
item_id                                              False
item_category_id                                     False
general_item_category_name                           False
city                                                 False
not_full_historical_data                              True
first_month_item_id                                  False
target_aggregated_mean_premonthes_item_id_shop_id    False
target_aggregated_max_premonthes_item_id_shop_id     False
target_aggregated_mean_premonthes_item_id            False
target_aggregated_max_premonthes_item_id             False
target_aggregated_mean_premonthes_shop_id            False
target_aggregated_max_premonthes_shop_id             False
month                                                False
year                                                 False
item_id_was_in_test                                  Fal

In [80]:
train_test_y

Unnamed: 0,target
0,1.0
1,3.0
2,3.0
3,7.0
4,4.0
...,...
10812856,0.0
10812857,0.0
10812858,0.0
10812859,0.0


In [115]:
%reset -f # to clean memory

Don't know how to reset  #, please run `%reset?` for details
Don't know how to reset  to, please run `%reset?` for details
Don't know how to reset  clean, please run `%reset?` for details
Don't know how to reset  memory, please run `%reset?` for details


In [None]:

# full_df[:20000].to_csv('../data/interim/full_df_final.csv', index = False)