In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from downcast import reduce
import warnings
from plotly.io import to_image
from IPython.display import Image
import pickle


# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas warnings (e.g. about chained
# assignment), which can mask real problems in later cells — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
%matplotlib inline

#### <a class="page-link" href='#read_pickle'>Ler /data/processed/fe1_data.pkl existente (clique aqui)</a>

In [4]:
# Load the three pre-processed inputs this notebook joins together:
#   prices     - sell price per item / store / yearweek
#   cal        - one row per day key `d` with date, weekday and event columns
#   sales_eval - wide-form sales, one `d_*` column per day
prices = pd.read_pickle("../data/processed/item_prices.pkl")
cal = pd.read_pickle("../data/processed/daily_calendar_with_events.pkl")
sales_eval = pd.read_pickle('../data/processed/item_sales.pkl')

In [3]:
# Reshape from wide form (one `d_*` column per day) to long form
# (one row per series id and day).
id_columns = ['id', 'item', 'category', 'department', 'store_code', 'region']
day_columns = [column for column in sales_eval.columns if column.startswith('d_')]

data = sales_eval.melt(
    id_vars=id_columns,
    value_vars=day_columns,
    # `var_name` must be a scalar when the columns are not a MultiIndex;
    # passing a one-element list raises in recent pandas releases.
    var_name='d',
    value_name='sales',
)
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0


In [4]:
# Fill missing `yearweek` values by counting on from the last known week:
# every row in a run of gaps becomes (last non-missing value) + (its position
# inside the run), i.e. v, NA, NA -> v, v+1, v+2.
#
# This replaces a row-by-row loop that assigned through chained indexing
# (`prices['yearweek'][i] = ...`). That form writes into a temporary Series and
# is silently lost under pandas Copy-on-Write, and it looked up label `i - 1`,
# which fails when the very first row is missing. Here a leading gap simply
# stays <NA>.
#
# NOTE(review): like the loop it replaces, this ignores item/store boundaries
# and does not roll over at year end (e.g. ...52 -> ...53); confirm that gaps
# never span either of those.
yearweek = prices['yearweek'].astype('Int64')
is_missing = yearweek.isna()

# Each known row starts a new run; the gap rows after it share its run id.
run_id = (~is_missing).cumsum()

# 0 on known rows, then 1, 2, 3, ... across each consecutive run of gaps.
steps_since_known = is_missing.astype('int64').groupby(run_id).cumsum()

prices['yearweek'] = yearweek.ffill() + steps_since_known

In [6]:
prices.dtypes

item          category
category      category
store_code    category
yearweek         Int64
sell_price     float16
dtype: object

In [7]:
# Turn the price-side merge keys into pandas 'string' columns.
for key_column in ('item', 'category', 'store_code'):
    prices[key_column] = prices[key_column].astype('string')

# `yearweek` goes through the nullable integer dtype first and is then
# rendered as text.
yearweek_int = prices['yearweek'].astype('Int64')
prices['yearweek'] = yearweek_int.astype('string')

In [8]:
# Same conversion on the sales side: key columns become 'string' dtype.
for key_column in ('item', 'category', 'store_code'):
    data[key_column] = data[key_column].astype('string')

In [9]:
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0


In [5]:
# Attach the calendar columns to every sales row via the day key `d`
# (left join: every sales row is kept).
merged = data.merge(cal, how='left', on='d')

In [11]:
merged.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales,date,weekday,weekday_int,event
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,


In [9]:
# Derive calendar features from the `date` column.
dates = pd.to_datetime(merged['date'])
merged['date'] = dates
merged['year'] = dates.dt.year
merged['month'] = dates.dt.month
merged['day'] = dates.dt.day

# ISO week number rendered as a zero-padded, two-character string.
merged['week'] = dates.dt.isocalendar().week.astype(str).str.zfill(2)

# Key used to join the weekly prices: calendar year followed by ISO week.
# NOTE(review): around New Year the ISO week can belong to the neighbouring
# ISO year (2012-01-01 is ISO week 52), so this yields keys such as '201252'.
# Confirm that `prices.yearweek` was built with the same convention.
merged['yearweek'] = merged['year'].astype(str) + merged['week']

ValueError: invalid literal for int() with base 10: 'd_1'

In [8]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 0 to 58327369
Data columns (total 17 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           category      
 1   item         category      
 2   category     category      
 3   department   category      
 4   store_code   category      
 5   region       category      
 6   d            object        
 7   sales        int16         
 8   date         datetime64[ns]
 9   weekday      category      
 10  weekday_int  int8          
 11  event        category      
 12  year         int64         
 13  month        int64         
 14  day          int64         
 15  week         object        
 16  yearweek     object        
dtypes: category(8), datetime64[ns](1), int16(1), int64(3), int8(1), object(3)
memory usage: 4.2+ GB


In [15]:
prices.head()

Unnamed: 0,item,category,store_code,yearweek,sell_price
0,ACCESORIES_1_001,ACCESORIES,NYC_1,201328,12.742188
1,ACCESORIES_1_001,ACCESORIES,NYC_1,201329,12.742188
2,ACCESORIES_1_001,ACCESORIES,NYC_1,201330,10.984375
3,ACCESORIES_1_001,ACCESORIES,NYC_1,201331,10.984375
4,ACCESORIES_1_001,ACCESORIES,NYC_1,201332,10.984375


In [13]:
# Store the price-side week key with object dtype.
# NOTE(review): presumably so it matches the object-typed `yearweek` on the
# sales side before the merge — confirm.
yearweek_as_object = prices['yearweek'].astype(object)
prices['yearweek'] = yearweek_as_object

In [16]:
# Look up the weekly sell price for every sales row (left join: rows with no
# matching price keep a missing `sell_price`).
merged2 = merged.merge(prices, how='left', on=['store_code', 'item', 'yearweek'])

In [18]:
# `category` exists in both merge inputs but is not a join key, so the merge
# suffixed the two copies; discard the one that came from `prices`.
merged2 = merged2.drop(columns=['category_y'])

In [19]:
# Give the surviving sales-side column its original name back.
merged2 = merged2.rename({'category_x': 'category'}, axis='columns')

In [22]:
# Continue on an independent (deep) copy so later edits to `data` do not
# touch `merged2`.
data = merged2.copy(deep=True)

In [21]:
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales,date,weekday,weekday_int,event,year,month,day,week,yearweek,sell_price
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,,2011,1,29,4,201104,
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,,2011,1,29,4,201104,
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,,2011,1,29,4,201104,
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,,2011,1,29,4,201104,
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,NYC_1,New York,d_1,0,2011-01-29,Saturday,1,,2011,1,29,4,201104,


In [23]:
# Impute missing prices with the mean price of the same series (`id`).
# The result is assigned back to the column: calling
# `data["sell_price"].fillna(..., inplace=True)` mutates a temporary Series and
# leaves `data` unchanged under pandas Copy-on-Write.
mean_price_by_id = data.groupby("id")["sell_price"].transform("mean")
data["sell_price"] = data["sell_price"].fillna(mean_price_by_id)

# Day key 'd_123' -> integer 123 (vectorised string split instead of a
# per-row Python lambda).
data['d'] = data['d'].str.split('_').str[1].astype(np.int16)

# `date` and `weekday` are dropped here; the year/month/day and weekday_int
# columns remain in the frame.
data = data.drop(columns=["date", "weekday"])
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales,weekday_int,event,year,month,day,week,yearweek,sell_price
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,,2011,1,29,4,201104,11.010936
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,,2011,1,29,4,201104,5.28125
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,,2011,1,29,4,201104,3.949219
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,,2011,1,29,4,201104,6.019237
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,,2011,1,29,4,201104,3.91419


In [24]:
# Checkpoint: persist the merged, price-imputed frame to disk.
# NOTE: `pickle` is already imported in the first cell and is not referenced
# in this cell; `DataFrame.to_pickle` does not require it.
import pickle
data.to_pickle('../data/processed/fe1_data.pkl')

### <a id='read_pickle'>Read Pickle fe1_data.pkl</a>

In [3]:
# Resume point: reload the checkpoint written by the merge stage instead of
# recomputing it.
# NOTE: `pickle` is already imported in the first cell and is not referenced
# in this cell; `pd.read_pickle` does not require it.
import pickle
data = pd.read_pickle("../data/processed/fe1_data.pkl")

### Label Encoding

In [5]:
def _code_to_label(column):
    """Map each integer category code found in `column` to its label."""
    return dict(zip(column.cat.codes, column))


# Reverse look-ups (code -> original label), built from the categorical
# columns of the wide sales table.
# NOTE(review): these use the category order of `sales_eval`; confirm it
# matches the order produced when `data` is re-cast to categoricals.
d_id = _code_to_label(sales_eval['id'])
d_item_id = _code_to_label(sales_eval['item'])
d_department_id = _code_to_label(sales_eval['department'])
d_category_id = _code_to_label(sales_eval['category'])
d_store_id = _code_to_label(sales_eval['store_code'])
d_region_id = _code_to_label(sales_eval['region'])

In [4]:
# Cast the key columns and the day index to categoricals.
# NOTE(review): `d` is already an integer day number; once its categories are
# replaced by codes it becomes 0-based (day 1 -> 0). Confirm that later
# filters on `d` expect that.
for column in ('item', 'category', 'store_code', 'd'):
    data[column] = data[column].astype('category')

# Days with no event get an explicit 'NO_EVENT' level instead of a missing
# value. The column is converted to object first, then back to category.
data["event"] = (
    data["event"]
    .astype('object')
    .replace(np.nan, 'NO_EVENT')
    .astype('category')
)

In [5]:
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales,weekday_int,event,year,month,day,week,yearweek,sell_price
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,NO_EVENT,2011,1,29,4,201104,11.010936
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,NO_EVENT,2011,1,29,4,201104,5.28125
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,NO_EVENT,2011,1,29,4,201104,3.949219
3,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,NO_EVENT,2011,1,29,4,201104,6.019237
4,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,ACCESORIES_1,NYC_1,New York,1,0,1,NO_EVENT,2011,1,29,4,201104,3.91419


In [6]:
# Label-encode: replace every categorical column with its integer category
# codes (pandas uses -1 for missing values).
# The column list is taken once, up front, so reassigning columns inside the
# loop cannot affect what is iterated. Unlike the earlier loop, no variable
# named `type` is created, so the builtin stays usable in later cells.
categorical_columns = data.select_dtypes(include='category').columns
for column in categorical_columns:
    data[column] = data[column].cat.codes

In [7]:
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales,weekday_int,event,year,month,day,week,yearweek,sell_price
0,3,0,0,0,3,1,0,0,1,1,2011,1,29,4,201104,11.010936
1,13,1,0,0,3,1,0,0,1,1,2011,1,29,4,201104,5.28125
2,23,2,0,0,3,1,0,0,1,1,2011,1,29,4,201104,3.949219
3,33,3,0,0,3,1,0,0,1,1,2011,1,29,4,201104,6.019237
4,43,4,0,0,3,1,0,0,1,1,2011,1,29,4,201104,3.91419


In [8]:
# Introducing lag and rolling features.
# Lag features: sales of the same series `lag` rows earlier.
# NOTE(review): assumes the rows of each `id` are in chronological order.

sales_by_id = data.groupby("id")["sales"]
for lag in (1, 2, 3, 5, 7, 14, 21, 28):
    data[f"lag_{lag}"] = sales_by_id.shift(lag).astype(np.float16)

In [9]:
# Rolling-mean features over each series' *previous* sales.
# Each series is shifted by one day before the window is applied, so the value
# on row t averages days t-window .. t-1. Without the shift the window would
# include sales[t] itself, leaking the current day's sales (presumably the
# modelling target) into its own feature.
# NOTE(review): assumes the rows of each `id` are in chronological order.

for window in (10, 20, 30):
    data[f'rolling_mean_{window}'] = (
        data.groupby('id')['sales']
        .transform(lambda sales: sales.shift(1).rolling(window).mean())
    )

In [10]:
data['event'].unique()

array([1, 4, 3, 5, 2, 0], dtype=int8)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 0 to 58327369
Data columns (total 27 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int16  
 1   item             int16  
 2   category         int8   
 3   department       int8   
 4   store_code       int8   
 5   region           int8   
 6   d                int16  
 7   sales            int16  
 8   weekday_int      int8   
 9   event            int8   
 10  year             int64  
 11  month            int64  
 12  day              int64  
 13  week             object 
 14  yearweek         object 
 15  sell_price       float32
 16  lag_1            float16
 17  lag_2            float16
 18  lag_3            float16
 19  lag_5            float16
 20  lag_7            float16
 21  lag_14           float16
 22  lag_21           float16
 23  lag_28           float16
 24  rolling_mean_10  float64
 25  rolling_mean_20  float64
 26  rolling_mean_30  float64
dtypes: float16

In [25]:
# Keep only the most recent part of the history: rows whose day index is
# above the cut-off.
# NOTE(review): if `d` has been label-encoded it is 0-based, which moves this
# cut-off by one day — confirm which form is intended here.
is_recent = data['d'] > 1000
data = data[is_recent]
data.head()

Unnamed: 0,id,item,category,department,store_code,region,d,sales,date,weekday,...,lag_2,lag_3,lag_5,lag_7,lag_14,lag_21,lag_28,rolling_mean_10,rolling_mean_20,rolling_mean_30
30490,3,0,0,0,3,1,1025,0,2011-01-30,3,...,,,,,,,,,,
30491,13,1,0,0,3,1,1025,0,2011-01-30,3,...,,,,,,,,,,
30492,23,2,0,0,3,1,1025,0,2011-01-30,3,...,,,,,,,,,,
30493,33,3,0,0,3,1,1025,0,2011-01-30,3,...,,,,,,,,,,
30494,43,4,0,0,3,1,1025,0,2011-01-30,3,...,,,,,,,,,,


> Uma vez que a base de dados completa ocupa mais de 6.7GB, iremos trabalhar apenas com uma amostra.
Vamos considerar apenas os últimos 913 dias (linhas com `d` > 1000).

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27837370 entries, 30490000 to 58327369
Data columns (total 27 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int16  
 1   item             string 
 2   category         string 
 3   department       int8   
 4   store_code       string 
 5   region           int8   
 6   d                int16  
 7   sales            int16  
 8   weekday_int      int8   
 9   event            int8   
 10  year             int64  
 11  month            int64  
 12  day              int64  
 13  week             object 
 14  yearweek         object 
 15  sell_price       float16
 16  lag_1            float16
 17  lag_2            float16
 18  lag_3            float16
 19  lag_5            float16
 20  lag_7            float16
 21  lag_14           float16
 22  lag_21           float16
 23  lag_28           float16
 24  rolling_mean_10  float64
 25  rolling_mean_20  float64
 26  rolling_mean_30  float64
dtypes: 

> Repare-se que, ao considerarmos uma amostra, o ficheiro ficou bem mais leve: 3.2GB.

In [12]:
# Persist the reduced, feature-enriched frame (lags and rolling means
# included) for the modelling stage.
data.to_pickle('../data/processed/fe2_data.pkl')

> Agora que realizámos os merges de todos os preços e dados das datas e introduzimos features de vendas anteriores, como os lags e as médias móveis, estamos prontos para o estágio da modelação.