In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [51]:
from feature_engine.datetime import DatetimeFeatures
from feature_engine.timeseries.forecasting import LagFeatures, WindowFeatures, ExpandingWindowFeatures

In [52]:
# Load the sales table from parquet.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not run
# elsewhere unless this location exists; consider a configurable data directory.
data = pd.read_parquet('d:/demand-forecast-SQGroup/data/sales_bya_v2.parquet')

In [53]:
# Peek at the first five rows, transposed so each source column reads as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
cid,10001,10001,10001,10001,10001
item_type_id,186,186,186,186,186
category,1,1,1,1,1
date,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00
outlet_id,1003,1003,1003,1003,1003
wire,1,1,1,1,1
rm,1.0,1.0,1.0,1.0,1.0
fy,17,17,17,17,17
base_size,6,6,6,6,6
client_id,3DHEHE9HHE9H5MAK,3DHEHE9HHE9H9H1L,3DHEHE9HHEHE3DHE,3DHEHE9HHEHE4N7O,3DHEHE9HHEHEHEAK


In [54]:
# Keep only the columns needed for the aggregate series. Selecting first and then
# copying avoids duplicating every column of `data` just to throw most of them away,
# while still giving `df` its own independent storage.
df = data[['date', 'net_price', 'qtym']].copy()

In [55]:
# Collapse to one row per date by summing net_price and qtym; `as_index=False`
# keeps `date` as a regular column on a fresh RangeIndex.
timeseries = df.groupby('date', as_index=False).sum()

In [56]:
# Inspect the per-date aggregate (one row per distinct date).
timeseries

Unnamed: 0,date,net_price,qtym
0,2019-07-01,2.627232e+07,823941.875000
1,2019-07-02,1.510582e+07,528482.937500
2,2019-07-04,5.594980e+06,197854.296875
3,2019-07-06,9.242950e+06,335050.500000
4,2019-07-07,9.014580e+06,314290.968750
...,...,...,...
1144,2023-06-25,4.866168e+04,1648.000000
1145,2023-06-26,2.878786e+06,100510.070312
1146,2023-06-27,4.076503e+06,147443.796875
1147,2023-06-28,5.416796e+06,192013.406250


<center><h1>Temporal Features</h1></center>

In [57]:
def add_tempoal_features(df: pd.DataFrame) -> bool:
    """Append calendar features derived from the datetime column(s) of ``df``, in place.

    ``DatetimeFeatures`` is run over the whole frame with ``variables=None`` and
    ``drop_original=True``. Every column it creates (named ``<source>_<feature>``,
    e.g. ``date_month``) is cast to ``int32`` and written onto ``df``. The source
    datetime column itself stays in ``df``; it is only dropped from the
    intermediate frame.

    New columns are identified by name (anything in the transformer output that
    ``df`` does not already have) rather than by position, so the result does not
    depend on how many non-datetime columns ``df`` carries.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding at least one datetime column. Modified in place.

    Returns
    -------
    bool
        ``True`` when the features were added. ``False`` when extraction or the
        ``int32`` cast failed (for example missing values, since
        ``missing_values='raise'``); a ``RuntimeWarning`` carrying the original
        error is emitted and ``df`` is left unmodified.
    """
    import warnings

    features_needed = [
        "month", "quarter", "quarter_start", "quarter_end", "year_start", "year_end",
        "week", "day_of_week", "day_of_month", "day_of_year", "weekend", "month_start",
        "month_end"
    ]
    try:
        extractor = DatetimeFeatures(
            variables=None, features_to_extract=features_needed, drop_original=True,
            missing_values='raise', dayfirst=False, yearfirst=False,
            utc=None, format=None
        )
        features = extractor.fit_transform(df)

        # Pick the generated columns by name instead of a positional slice.
        new_cols = [col for col in features.columns if col not in df.columns]
        # Cast everything up front so a failing cast cannot leave df half-updated.
        casted = features[new_cols].astype('int32')
    except Exception as exc:
        # Keep the bool contract, but surface why the call failed.
        warnings.warn(
            f"add_tempoal_features failed: {exc!r}", RuntimeWarning, stacklevel=2
        )
        return False

    for col in new_cols:
        df[col] = casted[col].to_numpy()

    return True
    

In [60]:
# Scratch check of the extractor on a subset of the calendar features.
# fit_transform returns a new frame; `timeseries` itself is not modified here.
extractor = DatetimeFeatures(
    variables=None,
    features_to_extract=[
        "month", "quarter", "quarter_start",
        "quarter_end", "year_start", "year_end",
    ],
    drop_original=True,
    missing_values='raise',
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
)
extractor.fit_transform(timeseries)

Unnamed: 0,net_price,qtym,date_month,date_quarter,date_quarter_start,date_quarter_end,date_year_start,date_year_end
0,2.627232e+07,823941.875000,7,3,1,0,0,0
1,1.510582e+07,528482.937500,7,3,0,0,0,0
2,5.594980e+06,197854.296875,7,3,0,0,0,0
3,9.242950e+06,335050.500000,7,3,0,0,0,0
4,9.014580e+06,314290.968750,7,3,0,0,0,0
...,...,...,...,...,...,...,...,...
1144,4.866168e+04,1648.000000,6,2,0,0,0,0
1145,2.878786e+06,100510.070312,6,2,0,0,0,0
1146,4.076503e+06,147443.796875,6,2,0,0,0,0
1147,5.416796e+06,192013.406250,6,2,0,0,0,0


In [61]:
# Re-display `timeseries`: it still has only date / net_price / qtym at this point.
timeseries

Unnamed: 0,date,net_price,qtym
0,2019-07-01,2.627232e+07,823941.875000
1,2019-07-02,1.510582e+07,528482.937500
2,2019-07-04,5.594980e+06,197854.296875
3,2019-07-06,9.242950e+06,335050.500000
4,2019-07-07,9.014580e+06,314290.968750
...,...,...,...
1144,2023-06-25,4.866168e+04,1648.000000
1145,2023-06-26,2.878786e+06,100510.070312
1146,2023-06-27,4.076503e+06,147443.796875
1147,2023-06-28,5.416796e+06,192013.406250


In [62]:
# Adds the calendar columns to `timeseries` in place; the displayed bool is the
# function's success flag.
add_tempoal_features(timeseries)

True

In [63]:
# Confirm the new date_* columns are present (transposed for readability).
timeseries.head().T

Unnamed: 0,0,1,2,3,4
date,2019-07-01 00:00:00,2019-07-02 00:00:00,2019-07-04 00:00:00,2019-07-06 00:00:00,2019-07-07 00:00:00
net_price,26272324.0,15105817.0,5594980.5,9242950.0,9014580.0
qtym,823941.875,528482.9375,197854.296875,335050.5,314290.96875
date_month,7,7,7,7,7
date_quarter,3,3,3,3,3
date_quarter_start,1,0,0,0,0
date_quarter_end,0,0,0,0,0
date_year_start,0,0,0,0,0
date_year_end,0,0,0,0,0
date_week,27,27,27,27,27


In [64]:
# Check dtypes and non-null counts after adding the calendar features.
timeseries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149 entries, 0 to 1148
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                1149 non-null   datetime64[ns]
 1   net_price           1149 non-null   float32       
 2   qtym                1149 non-null   float32       
 3   date_month          1149 non-null   int32         
 4   date_quarter        1149 non-null   int32         
 5   date_quarter_start  1149 non-null   int32         
 6   date_quarter_end    1149 non-null   int32         
 7   date_year_start     1149 non-null   int32         
 8   date_year_end       1149 non-null   int32         
 9   date_week           1149 non-null   int32         
 10  date_day_of_week    1149 non-null   int32         
 11  date_day_of_month   1149 non-null   int32         
 12  date_day_of_year    1149 non-null   int32         
 13  date_weekend        1149 non-null   int32       

<center><h1>Lag Features</h1></center>

In [65]:
def add_lag_features(df: pd.DataFrame) -> bool:
    """Append lagged copies of ``net_price`` and ``qtym`` to ``df``, in place.

    ``LagFeatures`` is fitted on the ``date`` / ``net_price`` / ``qtym`` columns
    with ``periods=[1, 3, 7, 14, 28]``; the resulting ``<col>_lag_<k>`` columns
    are written onto ``df``. The source columns are not rewritten.

    Lags are positional (``freq=None``): ``lag_k`` is the value ``k`` rows
    earlier, which equals ``k`` calendar days only when ``df`` holds one row per
    consecutive day.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``date``, ``net_price`` and ``qtym`` columns. Modified in place.

    Returns
    -------
    bool
        ``True`` when the lag columns were added. ``False`` on any failure (for
        example a missing column, or missing values since
        ``missing_values='raise'``); a ``RuntimeWarning`` carrying the original
        error is emitted and ``df`` is left unmodified.
    """
    import warnings

    source_cols = ['date', 'net_price', 'qtym']
    try:
        lag = LagFeatures(
            variables=None,
            periods=[1, 3, 7, 14, 28], freq=None, sort_index=True,
            missing_values='raise', drop_original=False
        )

        features = lag.fit_transform(df[source_cols])

        # Select generated columns by name. The previous positional slice `[2:]`
        # started at `qtym`, so it also rewrote a source column.
        new_cols = [col for col in features.columns if col not in source_cols]
    except Exception as exc:
        # Keep the bool contract, but surface why the call failed.
        warnings.warn(
            f"add_lag_features failed: {exc!r}", RuntimeWarning, stacklevel=2
        )
        return False

    for col in new_cols:
        # Assign by index label, not position: with sort_index=True the
        # transformer may hand rows back in sorted-index order.
        df[col] = features[col]

    return True
    

In [67]:
# Preview the lag columns LagFeatures produces for the three source columns.
lag = LagFeatures(
    variables=None,
    periods=[1, 3, 7, 14, 28],
    freq=None,
    sort_index=True,
    missing_values='raise',
    drop_original=False,
)
lag.fit_transform(timeseries[['date', 'net_price', 'qtym']]).head().T

Unnamed: 0,0,1,2,3,4
date,2019-07-01 00:00:00,2019-07-02 00:00:00,2019-07-04 00:00:00,2019-07-06 00:00:00,2019-07-07 00:00:00
net_price,26272324.0,15105817.0,5594980.5,9242950.0,9014580.0
qtym,823941.875,528482.9375,197854.296875,335050.5,314290.96875
net_price_lag_1,,26272324.0,15105817.0,5594980.5,9242950.0
qtym_lag_1,,823941.875,528482.9375,197854.296875,335050.5
net_price_lag_3,,,,26272324.0,15105817.0
qtym_lag_3,,,,823941.875,528482.9375
net_price_lag_7,,,,,
qtym_lag_7,,,,,
net_price_lag_14,,,,,


In [68]:
# Adds the lag columns to `timeseries` in place; the displayed bool is the
# function's success flag.
add_lag_features(timeseries)

True

In [69]:
# Inspect the first rows after the lag step (transposed for readability).
timeseries.head().T

Unnamed: 0,0,1,2,3,4
date,2019-07-01 00:00:00,2019-07-02 00:00:00,2019-07-04 00:00:00,2019-07-06 00:00:00,2019-07-07 00:00:00
net_price,26272324.0,15105817.0,5594980.5,9242950.0,9014580.0
qtym,823941.875,528482.9375,197854.296875,335050.5,314290.96875
date_month,7,7,7,7,7
date_quarter,3,3,3,3,3
date_quarter_start,1,0,0,0,0
date_quarter_end,0,0,0,0,0
date_year_start,0,0,0,0,0
date_year_end,0,0,0,0,0
date_week,27,27,27,27,27


<center><h1>Window Features</h1></center>

In [70]:
def add_window_features(df: pd.DataFrame) -> None:
    """Append rolling-window statistics of ``net_price`` and ``qtym`` to ``df``, in place.

    ``WindowFeatures`` is fitted on the ``date`` / ``net_price`` / ``qtym``
    columns with ``window=7``, ``min_periods=7``, ``periods=7`` and the
    functions ``mean``, ``std`` and ``median``. Every column it generates is
    written onto ``df``.

    Windows and shifts are positional (``freq=None``), i.e. counted in rows, so
    they correspond to calendar days only when ``df`` holds one row per
    consecutive day.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``date``, ``net_price`` and ``qtym`` columns. Modified in place.

    Returns
    -------
    None
        The function only mutates ``df``.

    Raises
    ------
    Exception
        Any error raised by pandas or ``WindowFeatures`` propagates to the
        caller (for example a missing column, or missing values since
        ``missing_values='raise'``).
    """
    source_cols = ['date', 'net_price', 'qtym']
    window = WindowFeatures(
        variables=None, window=7, min_periods=7,
        functions=['mean', 'std', 'median'], periods=7, freq=None, sort_index=True,
        missing_values='raise', drop_original=False
    )
    features = window.fit_transform(df[source_cols])

    # Select generated columns by name rather than by a positional slice.
    new_cols = [col for col in features.columns if col not in source_cols]
    for col in new_cols:
        # Assign by index label, not position: with sort_index=True the
        # transformer may hand rows back in sorted-index order.
        df[col] = features[col]

In [71]:
# Adds the rolling-window columns to `timeseries` in place.
add_window_features(timeseries)

In [72]:
# Inspect the first ten rows after the window step (transposed for readability).
timeseries.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
date,2019-07-01 00:00:00,2019-07-02 00:00:00,2019-07-04 00:00:00,2019-07-06 00:00:00,2019-07-07 00:00:00,2019-07-09 00:00:00,2019-07-10 00:00:00,2019-07-11 00:00:00,2019-07-13 00:00:00,2019-07-15 00:00:00
net_price,26272324.0,15105817.0,5594980.5,9242950.0,9014580.0,10005522.0,6434367.5,6246905.0,9816375.0,10813265.0
qtym,823941.875,528482.9375,197854.296875,335050.5,314290.96875,330534.34375,241389.828125,232441.0625,352790.5625,364290.0
date_month,7,7,7,7,7,7,7,7,7,7
date_quarter,3,3,3,3,3,3,3,3,3,3
date_quarter_start,1,0,0,0,0,0,0,0,0,0
date_quarter_end,0,0,0,0,0,0,0,0,0,0
date_year_start,0,0,0,0,0,0,0,0,0,0
date_year_end,0,0,0,0,0,0,0,0,0,0
date_week,27,27,27,27,27,28,28,28,28,29


<center><h1>Expanding Features</h1></center>

In [73]:
def add_exp_window_features(df: pd.DataFrame) -> None:
    """Append expanding-window standard deviations of ``net_price`` and ``qtym`` to ``df``, in place.

    ``ExpandingWindowFeatures`` is fitted on the ``date`` / ``net_price`` /
    ``qtym`` columns with ``functions='std'``, ``min_periods=7`` and
    ``periods=7``. Every column it generates is written onto ``df``.

    The expansion and shift are positional (``freq=None``), i.e. counted in
    rows, so they correspond to calendar days only when ``df`` holds one row per
    consecutive day.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``date``, ``net_price`` and ``qtym`` columns. Modified in place.

    Returns
    -------
    None
        The function only mutates ``df``.

    Raises
    ------
    Exception
        Any error raised by pandas or ``ExpandingWindowFeatures`` propagates to
        the caller (for example a missing column, or missing values since
        ``missing_values='raise'``).
    """
    source_cols = ['date', 'net_price', 'qtym']
    expwindow = ExpandingWindowFeatures(
        variables=None, min_periods=7, functions='std',
        periods=7, freq=None, sort_index=True,
        missing_values='raise', drop_original=False
    )
    features = expwindow.fit_transform(df[source_cols])

    # Select generated columns by name rather than by a positional slice.
    new_cols = [col for col in features.columns if col not in source_cols]
    for col in new_cols:
        # Assign by index label, not position: with sort_index=True the
        # transformer may hand rows back in sorted-index order.
        df[col] = features[col]

In [74]:
# Adds the expanding-window columns to `timeseries` in place.
add_exp_window_features(timeseries)

In [75]:
# Inspect the first rows after the expanding-window step (transposed for readability).
timeseries.head().T

Unnamed: 0,0,1,2,3,4
date,2019-07-01 00:00:00,2019-07-02 00:00:00,2019-07-04 00:00:00,2019-07-06 00:00:00,2019-07-07 00:00:00
net_price,26272324.0,15105817.0,5594980.5,9242950.0,9014580.0
qtym,823941.875,528482.9375,197854.296875,335050.5,314290.96875
date_month,7,7,7,7,7
date_quarter,3,3,3,3,3
date_quarter_start,1,0,0,0,0
date_quarter_end,0,0,0,0,0
date_year_start,0,0,0,0,0
date_year_end,0,0,0,0,0
date_week,27,27,27,27,27


In [76]:
# Persist the engineered per-date frame. This cell previously wrote `df`, which is
# only the raw date / net_price / qtym subset and carries none of the calendar,
# lag or window columns that were added to `timeseries`.
# NOTE(review): absolute, machine-specific output path — confirm it is intended.
timeseries.to_parquet('d:/demand-forecast-SQGroup/data/sales_BYA_v3.parquet', index=False)