In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
plt.style.use('ggplot')

import plotly.express as px

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.subplots import make_subplots
import gc
import joblib
import os
import pickle

  import pandas.util.testing as tm


In [2]:
with open('df.pickle','rb') as f:
    df = pickle.load(f)

### 5. Feature Engineering
Time Series data must be re-framed as a supervised learning dataset before we can start using machine learning algorithms.

There is no concept of input and output features in time series. Instead, we must choose the variable to be predicted and use feature engineering to construct all of the inputs that will be used to make predictions for future time steps.

The goal of feature engineering is to provide strong and ideally simple relationships between new input features and the output feature for the supervised learning algorithm to model.

<h5>5.1 Label Encoding</h5>
1. Remove unwanted data to create space in RAM for further processing.<br>
2. Label Encode categorical features.(I had converted already converted categorical variable to category type. So, I can simply use their codes instead of using LableEncoder)<br>
3. Remove date as its features are already present

Please note:
I'm storing the categories correponding to their respective category codes so that I'can use them later on while making the submission.

개수 많을 때 dict형태로 정리하기! :https://cnpnote.tistory.com/entry/PYTHON-%ED%8C%90%EB%8B%A4-Pandas-%EC%B9%B4%ED%85%8C%EA%B3%A0%EB%A6%AC%EB%A5%BC-%EC%88%AB%EC%9E%90%EB%A1%9C-%EB%B3%80%ED%99%98

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60034810 entries, 0 to 60034809
Data columns (total 23 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            category      
 1   item_id       category      
 2   dept_id       category      
 3   cat_id        category      
 4   store_id      category      
 5   state_id      category      
 6   d             object        
 7   sold          int16         
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       category      
 11  wday          int8          
 12  month         int8          
 13  year          int16         
 14  event_name_1  category      
 15  event_type_1  category      
 16  event_name_2  category      
 17  event_type_2  category      
 18  snap_CA       int8          
 19  snap_TX       int8          
 20  snap_WI       int8          
 21  sell_price    float16       
 22  revenue       float32       
dtypes: category(11), datetime64[ns

In [4]:
%%time
df_sam=df.sample(frac = 0.3)

Wall time: 1min 53s


In [5]:
df_sam.shape

(18010443, 23)

In [6]:
%%time
df_sam.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18010443 entries, 940455 to 24105304
Data columns (total 23 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            category      
 1   item_id       category      
 2   dept_id       category      
 3   cat_id        category      
 4   store_id      category      
 5   state_id      category      
 6   d             object        
 7   sold          int16         
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       category      
 11  wday          int8          
 12  month         int8          
 13  year          int16         
 14  event_name_1  category      
 15  event_type_1  category      
 16  event_name_2  category      
 17  event_type_2  category      
 18  snap_CA       int8          
 19  snap_TX       int8          
 20  snap_WI       int8          
 21  sell_price    float16       
 22  revenue       float32       
dtypes: category(11), datetime

%%time
df_sam.id = df_sam.id.astype('category')
df_sam.item_id = df_sam.item_id.astype('category')
df_sam.dept_id = df_sam.dept_id.astype('category')
df_sam.cat_id = df_sam.cat_id.astype('category')
df_sam.store_id = df_sam.store_id.astype('category')
df_sam.state_id = df_sam.state_id.astype('category')

In [7]:
%%time
# Store the categories along with their codes - 개수 많을 때 dict형태로 정리하기!
d_sam_id = dict(zip(df_sam.id.cat.codes,df_sam.id))

Wall time: 8.73 s


In [8]:
d_sam_item_id = dict(zip(df_sam.item_id.cat.codes,df_sam.item_id))
d_sam_dept_id = dict(zip(df_sam.dept_id.cat.codes,df_sam.dept_id))
d_sam_cat_id = dict(zip(df_sam.cat_id.cat.codes,df_sam.cat_id))
d_sam_store_id = dict(zip(df_sam.store_id.cat.codes,df_sam.store_id))
d_sam_state_id = dict(zip(df_sam.state_id.cat.codes,df_sam.state_id))

In [10]:
#1 
#del group, group_price_cat,group_price_store,group_state,group_state_store,cal_data
#gc.collect()

#2
df_sam.d = df_sam['d'].apply(lambda x : str(x).split('_')[1]).astype(np.int16)
cols = df_sam.dtypes.index.tolist()
types = df_sam.dtypes.values.tolist()
for i,type in enumerate(types):
    if type.name == 'category':
        df_sam[cols[i]] = df_sam[cols[i]].cat.codes
        
        
# 3
df_sam.drop('date',axis = 1,inplace = True)

#### 5.2 Introduce Lags
Lag features are the classical way that time series forecasting problems are transformed into supervised learning problems.

Introduce lags to the the target variable sold. The maximum lag I have introduced is 36 days. It's purely upto you how many lags you want to introduce.

In [11]:
%%time
#Introduce lags
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id','item_id','dept_id','cat_id','store_id','state_id'],as_index = False)['sold'].shift(lag).astype(np.float16)

#### 5.3 Mean Encoding
From a mathematical point of view, mean encoding represents a probability of your target variable, conditional on each value of the feature. In a way, it embodies the target variable in its encoded value. I have calculated mean encodings on the basis of following logical features I could think of:-

- item
- state
- store
- category
- department
- category & department
- store & item
- category & item
- department & item
- state & store
- state, store and category
- store, category and department

In [12]:
%%time
df_sam['iteam_sold_avg'] = df_sam.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df_sam['state_sold_avg'] = df_sam.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df_sam['store_sold_avg'] = df_sam.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df_sam['cat_sold_avg'] = df_sam.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df_sam['dept_sold_avg'] = df_sam.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df_sam['cat_dept_sold_avg'] = df_sam.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df_sam['store_item_sold_avg'] = df_sam.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df_sam['cat_item_sold_avg'] = df_sam.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df_sam['dept_item_sold_avg'] = df_sam.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df_sam['state_store_sold_avg'] = df_sam.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df_sam['state_store_cat_sold_avg'] = df_sam.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df_sam['store_cat_dept_sold_avg'] = df_sam.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)

Wall time: 30.6 s


In [15]:
df_sam.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,wm_yr_wk,weekday,...,store_sold_avg,cat_sold_avg,dept_sold_avg,cat_dept_sold_avg,store_item_sold_avg,cat_item_sold_avg,dept_item_sold_avg,state_store_sold_avg,state_store_cat_sold_avg,store_cat_dept_sold_avg
940455,28008,2800,6,2,8,2,31,0,11105,1,...,1.117188,0.716797,0.300293,0.300293,0.171143,0.284424,0.284424,1.117188,0.69873,0.192139
45300362,28067,2806,6,2,7,2,1486,0,11504,3,...,0.875977,0.716797,0.300293,0.300293,0.15918,0.119324,0.119324,0.875977,0.520996,0.233521
27065154,6186,618,2,0,6,1,888,0,11323,4,...,1.036133,1.624023,2.035156,2.035156,0.743164,0.793457,0.793457,1.036133,1.500977,1.893555
17912770,13324,1332,2,0,4,1,588,0,11232,0,...,0.949219,1.624023,2.035156,2.035156,0.427246,1.180664,1.180664,0.949219,1.356445,1.739258
31079999,29793,2979,6,2,3,0,1020,1,11342,6,...,0.695801,0.716797,0.300293,0.300293,0.28418,0.808105,0.808105,0.695801,0.356689,0.19043


## 5.4 Rolling Window Statistic

This method is called the rolling window method because the window would be different for every data point.

I'll be calculating weekly rolling avearge of the items sold. More features like rolling min, max or sum can also be calculated. Also, same features can be calculated for revenue as well.

In [21]:
%%time
df_sam['rolling_sold_mean'] = df_sam.groupby(['id','item_id','dept_id','cat_id','store_id','state_id'])['sold'].transform(lambda x : x.rolling(window =7).mean()).astype(np.float16)

Wall time: 1min 19s


## 5.5 Expanding Window Statistics

This is simply an advanced version of the rolling window technique. In the case of a rolling window, the size of the window is constant while the window slides as we move forward in time. Hence, we consider only the most recent values and ignore the past values. Here’s a gif that explains how our expanding window function works

I'll be calculating expanding avearge of the items sold. More features like expanding min, max or sum can also be calculated. Also, same features can be calculated for revenue as well.

In [22]:
%%time
df_sam['expanding_sold_mean'] = df_sam.groupby(['id','item_id','dept_id','cat_id','store_id','state_id'])['sold'].transform(lambda x : x.expanding(2).mean()).astype(np.float16)

Wall time: 1min 10s


## 5.6 Trends

I will be creating a selling trend feature, which will be some positive value if the daily items sold are greater than the entire duration average ( d_1 - d_1969 ) else negative. More trend features can be added but I'll only add this one to keep it simple.

In [23]:
df_sam['daily_avg_sold'] = df_sam.groupby(['id','item_id','dept_id','cat_id','store_id','state_id','d'])['sold'].transform('mean').astype(np.float16)
df_sam['avg_sold'] = df.groupby(['id','item_id','dept_id','cat_id','store_id','state_id'])['sold'].transform('mean').astype(np.float16)
df_sam['selling_trend'] = (df_sam['daily_avg_sold'] - df_sam['avg_sold']).astype(np.float16)
df_sam.drop(['daily_avg_sold','avg_sold'],axis = 1,inplace = True)

## 5.7 Save the data

Now since all the new features are created, let's save the data so that it can be trained separately.Also, lags introduce a lot of Null values, so I'll remove data for first 35 days as I have introduced lags till 36 days.

In [24]:
%%time
df_sam = df_sam[df_sam['d']>=36]

Wall time: 13.1 s


In [25]:
df_sam.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17690827 entries, 45300362 to 24105304
Data columns (total 37 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   id                        int16  
 1   item_id                   int16  
 2   dept_id                   int8   
 3   cat_id                    int8   
 4   store_id                  int8   
 5   state_id                  int8   
 6   d                         int16  
 7   sold                      int16  
 8   wm_yr_wk                  int16  
 9   weekday                   int8   
 10  wday                      int8   
 11  month                     int8   
 12  year                      int16  
 13  event_name_1              int8   
 14  event_type_1              int8   
 15  event_name_2              int8   
 16  event_type_2              int8   
 17  snap_CA                   int8   
 18  snap_TX                   int8   
 19  snap_WI                   int8   
 20  sell_price     

In [29]:
gc.collect();

In [31]:
df_sam.to_csv('Input/data_sam.csv')