# Step 2: Feature engineering

In this notebook we will create a pipeline of transformers to create our features from the base dataset. We will store this pipeline and re-use it to create our features at inference time (i.e., when we want to make a forecast).

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Config paths

Specify the paths and directories that we will read data from and write data to. In practice we would store this in a separate file rather than duplicate it across all notebooks. For simplicity, we specify the paths in the notebook itself.

In [2]:
# Directory containing the raw data.
data_sources = Path("../data_sources")

# Directory containing our processed data (i.e., the base dataset
# ready for feature engineering).
processed_data_dir = Path("../processed_data")

# Artifacts directory for storing the
# training data, models, pipelines etc.
artifacts_dir = Path("../artifacts")
training_dir = artifacts_dir / "training"  # Directory to store our features for training.
pipeline_dir = artifacts_dir / "pipeline"  # Directory to store our feature engineering pipeline.

# Create the output directories. `parents=True` creates any missing
# intermediate directory, so the result does not depend on the order of
# the entries below; `exist_ok=True` keeps the cell safe to re-run.
for _dir in (artifacts_dir, training_dir, pipeline_dir):
    _dir.mkdir(parents=True, exist_ok=True)

# Load base dataset

In this tutorial we will load the data from one store only.

In [3]:
# Location of the processed base dataset.
f_in = processed_data_dir / "data"

# Row filters handed to pyarrow so only the required data is read.
read_filters = [
    ("store_id", "=", "CA_1"),  # Only load this partition.
    ("date", ">=", pd.to_datetime("2012-01-01")),  # Filter on date if desired.
]

df = pd.read_parquet(path=f_in, engine="pyarrow", filters=read_filters)

In [4]:
# Preview the first rows of the loaded base dataset.
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,state_id,y,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,store_id
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,2013-07-13,no_event,no_event,no_event,no_event,0,1,0,9.58,CA_1
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,2013-07-14,no_event,no_event,no_event,no_event,0,0,1,9.58,CA_1
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,2013-07-15,no_event,no_event,no_event,no_event,0,1,1,9.58,CA_1
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,2013-07-16,no_event,no_event,no_event,no_event,0,0,0,9.58,CA_1
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,2013-07-17,no_event,no_event,no_event,no_event,0,0,0,9.58,CA_1


Remove unused categories to avoid groupby operations inside transformers returning a result for unobserved categories. For more information you can read some of the [issues](https://github.com/pandas-dev/pandas/issues/17631) on GitHub.

In [5]:
# Drop the `id` categories that do not occur in the loaded rows, so that
# groupby operations do not return results for unobserved ids.
df["id"] = df["id"].cat.remove_unused_categories()

To handle panel time series (i.e., multiple time series) with sktime, we can represent the data by setting the index as follows:

In [6]:
# Panel layout: series identifier first, timestamp second.
panel_index = ["id", "date"]
df = df.set_index(panel_index).sort_index()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_id,dept_id,cat_id,state_id,y,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,store_id
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,no_event,no_event,no_event,no_event,0,1,0,9.58,CA_1
HOBBIES_1_001_CA_1_evaluation,2013-07-14,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,no_event,no_event,no_event,no_event,0,0,1,9.58,CA_1
HOBBIES_1_001_CA_1_evaluation,2013-07-15,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,no_event,no_event,no_event,no_event,0,1,1,9.58,CA_1
HOBBIES_1_001_CA_1_evaluation,2013-07-16,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,no_event,no_event,no_event,no_event,0,0,0,9.58,CA_1
HOBBIES_1_001_CA_1_evaluation,2013-07-17,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA,0,no_event,no_event,no_event,no_event,0,0,0,9.58,CA_1


In [7]:
# (rows, columns) of the frame after moving `id` and `date` into the index.
df.shape

(4242782, 14)

# Let's look at each individual transformer

## Datetime features

These features help capture any seasonality associated with the calendar. 
The `year` feature can also help capture any year-to-year changes or trends in the data.


Examples:
- day of month
- week of month
- week of year
- month of year
- day of week
- is weekend
- year

In [8]:
from sktime.transformations.series.date import DateTimeFeatures

In [9]:
# Calendar features to extract.
datetime_features = [
    "day_of_week",
    "is_weekend",
    "day_of_month",
    "week_of_month",
    "week_of_year",
    "month_of_year",
    "year",
]

# Transformer that outputs only the selected calendar features
# (the original columns are not kept).
datetime_trafo = DateTimeFeatures(
    manual_selection=datetime_features,
    keep_original_columns=False,
)

 is `True`. In future releases this will be changed 
 to `False`. To keep the current behaviour explicitly 
 set `keep_original_columns=True`.


Let's show how the transformer behaves on a subset of the data.

In [10]:
# Fit and apply the datetime transformer on the first few rows only.
datetime_sample = df.head()
datetime_trafo.fit(datetime_sample)
df_result = datetime_trafo.transform(datetime_sample)
df_result

 is `True`. In future releases this will be changed 
 to `False`. To keep the current behaviour explicitly 
 set `keep_original_columns=True`.


Unnamed: 0_level_0,Unnamed: 1_level_0,year,month_of_year,week_of_year,week_of_month,day_of_month,day_of_week,is_weekend
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,2013,7,28,2,13,5,1
HOBBIES_1_001_CA_1_evaluation,2013-07-14,2013,7,28,2,14,6,1
HOBBIES_1_001_CA_1_evaluation,2013-07-15,2013,7,29,3,15,0,0
HOBBIES_1_001_CA_1_evaluation,2013-07-16,2013,7,29,3,16,1,0
HOBBIES_1_001_CA_1_evaluation,2013-07-17,2013,7,29,3,17,2,0


## Time feature
- Time (or to be precise, the time elapsed since a fixed reference date, here `2000-01-01`, measured in days).

In [11]:
from sktime.transformations.series.time_since import TimeSince

In [12]:
# Time-since transformer: elapsed time relative to the fixed `start` date.
time_since_trafo = TimeSince(
    start=["2000-01-01"],  # Reference date the elapsed time is measured from.
    freq="D",  # The data has daily frequency
    keep_original_columns=False,
)

Let's see how the transformer behaves on a subset of the data.

In [13]:
# Fit and apply the time-since transformer on the first few rows only.
time_since_sample = df.head()
time_since_trafo.fit(time_since_sample)
df_result = time_since_trafo.transform(time_since_sample)
df_result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,time_since_2000-01-01 00:00:00
id,date,Unnamed: 2_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,4942
HOBBIES_1_001_CA_1_evaluation,2013-07-14,4943
HOBBIES_1_001_CA_1_evaluation,2013-07-15,4944
HOBBIES_1_001_CA_1_evaluation,2013-07-16,4945
HOBBIES_1_001_CA_1_evaluation,2013-07-17,4946


## Lag & Window features

- Lag of the target
- Rolling mean
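- Rolling standard deviation (not computed in the configuration below, which only uses `"lag"` and `"mean"`; add a `"std"` entry to `lag_feature` to include it)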

In [14]:
from sktime.transformations.series.summarize import WindowSummarizer

In [15]:
# Lag and rolling-window specification applied to every target column.
lag_feature_spec = {
    "lag": [1, 2, 3, 7, 14, 28],  # Lag features.
    "mean": [[1, 7], [1, 14], [1, 28]],  # [[lag, window size]]
}

lag_window_trafo = WindowSummarizer(
    lag_feature=lag_feature_spec,
    target_cols=["y", "sell_price"],
    # Backfill missing values from lagging and windowing.
    truncate="bfill",
)

Let's see how the transformer behaves on a subset of the data. `WindowSummarizer` passes through all the other columns in the dataframe and drops the `target_cols` columns. 

In [16]:
# Fit and apply the lag/window transformer on the first 30 rows of the
# two target columns.
lag_window_sample = df[["y", "sell_price"]].head(30)
lag_window_trafo.fit(lag_window_sample)
df_result = lag_window_trafo.transform(lag_window_sample)
df_result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,y_lag_1,y_lag_2,y_lag_3,y_lag_7,y_lag_14,y_lag_28,y_mean_1_7,y_mean_1_14,y_mean_1_28,sell_price_lag_1,sell_price_lag_2,sell_price_lag_3,sell_price_lag_7,sell_price_lag_14,sell_price_lag_28,sell_price_mean_1_7,sell_price_mean_1_14,sell_price_mean_1_28
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.071429,0.178571,9.58,9.58,9.58,9.58,9.58,9.58,9.58,9.58,8.967143
HOBBIES_1_001_CA_1_evaluation,2013-07-14,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.071429,0.178571,9.58,9.58,9.58,9.58,9.58,9.58,9.58,9.58,8.967143
HOBBIES_1_001_CA_1_evaluation,2013-07-15,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.071429,0.178571,9.58,9.58,9.58,9.58,9.58,9.58,9.58,9.58,8.967143
HOBBIES_1_001_CA_1_evaluation,2013-07-16,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.071429,0.178571,9.58,9.58,9.58,9.58,9.58,9.58,9.58,9.58,8.967143
HOBBIES_1_001_CA_1_evaluation,2013-07-17,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.071429,0.178571,9.58,9.58,9.58,9.58,9.58,9.58,9.58,9.58,8.967143


## Static features

These are the features that are constant for a given time series, but differ between time series. For example: `item_id`, `dept_id`, `cat_id`, `state_id`. They can be treated as categorical features. Tree-based libraries like LightGBM and XGBoost can handle static features natively as categorical features. Linear models will require some feature engineering of these features.

In [17]:
# Preview the static (per-series) columns.
df.loc[:, ["item_id", "dept_id", "cat_id", "state_id"]].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_id,dept_id,cat_id,state_id
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA
HOBBIES_1_001_CA_1_evaluation,2013-07-14,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA
HOBBIES_1_001_CA_1_evaluation,2013-07-15,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA
HOBBIES_1_001_CA_1_evaluation,2013-07-16,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA
HOBBIES_1_001_CA_1_evaluation,2013-07-17,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA


# Promos

These features are the variables prefixed with `snap_`. These are currently present as binary variables (0: no promotion, 1: promotion). No further engineering is required here for either tree-based models or linear models.

In [18]:
# Preview the columns whose name contains "snap".
df.filter(like="snap").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,snap_CA,snap_TX,snap_WI
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,0,1,0
HOBBIES_1_001_CA_1_evaluation,2013-07-14,0,0,1
HOBBIES_1_001_CA_1_evaluation,2013-07-15,0,1,1
HOBBIES_1_001_CA_1_evaluation,2013-07-16,0,0,0
HOBBIES_1_001_CA_1_evaluation,2013-07-17,0,0,0


# Special events

These features are the variables prefixed with `event_`. The `event_name` columns are more granular and refer to specific holidays and events (e.g., `"NBA Finals"`, `"Father's Day"`). The `event_type` columns are more aggregated descriptions of the event (e.g., `"Cultural"`, `"Religious"`). 

The data is provided effectively as a categorical variable. For tree-based models we could just leave these features as categorical.

In [19]:
# Print the distinct values observed in each column whose name contains "event".
event_columns = df.filter(like="event").columns
for event_col in event_columns:
    print(df[event_col].unique())

['no_event', 'Eid al-Fitr', 'LaborDay', 'ColumbusDay', 'EidAlAdha', ..., 'NBAFinalsEnd', 'Ramadan starts', 'IndependenceDay', 'OrthodoxEaster', 'Father's day']
Length: 31
Categories (31, object): ['Chanukah End', 'Christmas', 'Cinco De Mayo', 'ColumbusDay', ..., 'Thanksgiving', 'ValentinesDay', 'VeteransDay', 'no_event']
['no_event', 'Religious', 'National', 'Cultural', 'Sporting']
Categories (5, object): ['Cultural', 'National', 'Religious', 'Sporting', 'no_event']
['no_event', 'OrthodoxEaster', 'Father's day', 'Cinco De Mayo']
Categories (5, object): ['Cinco De Mayo', 'Easter', 'Father's day', 'OrthodoxEaster', 'no_event']
['no_event', 'Religious', 'Cultural']
Categories (3, object): ['Cultural', 'Religious', 'no_event']


In [20]:
# Preview the columns whose name contains "event".
df.filter(like="event").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,event_name_1,event_type_1,event_name_2,event_type_2
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,no_event,no_event,no_event,no_event
HOBBIES_1_001_CA_1_evaluation,2013-07-14,no_event,no_event,no_event,no_event
HOBBIES_1_001_CA_1_evaluation,2013-07-15,no_event,no_event,no_event,no_event
HOBBIES_1_001_CA_1_evaluation,2013-07-16,no_event,no_event,no_event,no_event
HOBBIES_1_001_CA_1_evaluation,2013-07-17,no_event,no_event,no_event,no_event


# Create a feature engineering pipeline

In [21]:
from sklearn.compose import (ColumnTransformer, 
                             make_column_selector,
                             make_column_transformer
                            )
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn import set_config
# Make scikit-learn transformers return pandas DataFrames rather than NumPy
# arrays, so column names and dtypes stay available to later pipeline steps.
set_config(transform_output="pandas")

In [22]:
# Use feature union to make the pipeline 
pipeline = make_union(
    datetime_trafo,   # Extract date time features.
    time_since_trafo, # Extract time since earliest date in number of days.
    lag_window_trafo, # Extract lag and window features of sales and sell price.
                      # `lag_window_trafo` passes through all remaining columns.
    ColumnTransformer([("selector", 
                        "passthrough",
                       ["sell_price"])], # Keep `sell_price` column
                      verbose_feature_names_out=False,
                      remainder="drop"), 
    
)

pipeline

In [23]:
# Min-max scale every numeric column produced by the feature union.
# No `remainder` is passed to the column transformer, so columns that the
# selector does not pick up fall under scikit-learn's default remainder handling.
numeric_scaler = make_column_transformer(
    (MinMaxScaler(), make_column_selector(dtype_include=np.number)),
    verbose_feature_names_out=False,
)

# NOTE: this rebinds `pipeline` to a wrapper around its previous value, so
# re-running this cell on its own nests the pipeline again. Re-run the
# feature-union cell first if this cell needs to be executed again.
pipeline = make_pipeline(pipeline, numeric_scaler)
pipeline

TODO: Change the min-max scaler to operate on each time series separately.

In [24]:
# Fit the full feature engineering pipeline on the whole dataset, then use it
# to build the feature frame. This might take a few minutes.
df_result = pipeline.fit(df).transform(df)

# Show the first rows together with the full list of feature names.
display(df_result.head(), df_result.columns)


 is `True`. In future releases this will be changed 
 to `False`. To keep the current behaviour explicitly 
 set `keep_original_columns=True`.


Unnamed: 0_level_0,Unnamed: 1_level_0,year,month_of_year,week_of_year,week_of_month,day_of_month,day_of_week,is_weekend,time_since_2000-01-01 00:00:00,y_lag_1,y_lag_2,...,sell_price_lag_7,sell_price_lag_14,sell_price_lag_28,sell_price_mean_1_7,sell_price_mean_1_14,sell_price_mean_1_28,snap_CA,snap_TX,snap_WI,sell_price
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,0.25,0.545455,0.519231,0.25,0.4,0.833333,1.0,0.348721,0.0,0.0,...,0.309009,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,1.0,0.0,0.309009
HOBBIES_1_001_CA_1_evaluation,2013-07-14,0.25,0.545455,0.519231,0.25,0.433333,1.0,1.0,0.349345,0.0,0.0,...,0.309009,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,0.0,1.0,0.309009
HOBBIES_1_001_CA_1_evaluation,2013-07-15,0.25,0.545455,0.538462,0.5,0.466667,0.0,0.0,0.349969,0.0,0.0,...,0.309009,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,1.0,1.0,0.309009
HOBBIES_1_001_CA_1_evaluation,2013-07-16,0.25,0.545455,0.538462,0.5,0.5,0.166667,0.0,0.350593,0.0,0.0,...,0.309009,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,0.0,0.0,0.309009
HOBBIES_1_001_CA_1_evaluation,2013-07-17,0.25,0.545455,0.538462,0.5,0.533333,0.333333,0.0,0.351216,0.0,0.0,...,0.309009,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,0.0,0.0,0.309009


Index(['year', 'month_of_year', 'week_of_year', 'week_of_month',
       'day_of_month', 'day_of_week', 'is_weekend',
       'time_since_2000-01-01 00:00:00', 'y_lag_1', 'y_lag_2', 'y_lag_3',
       'y_lag_7', 'y_lag_14', 'y_lag_28', 'y_mean_1_7', 'y_mean_1_14',
       'y_mean_1_28', 'sell_price_lag_1', 'sell_price_lag_2',
       'sell_price_lag_3', 'sell_price_lag_7', 'sell_price_lag_14',
       'sell_price_lag_28', 'sell_price_mean_1_7', 'sell_price_mean_1_14',
       'sell_price_mean_1_28', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price'],
      dtype='object')

Let's add the target, `y`, back to the dataframe that contains all of our features, `df_result`, for convenience.

In [25]:
# The assignment aligns on the shared (id, date) index, so every feature row
# receives the target value of the same series and date.
df_result["y"] = df["y"] 

In [26]:
# Preview the feature frame; `y` now appears as the last column.
df_result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month_of_year,week_of_year,week_of_month,day_of_month,day_of_week,is_weekend,time_since_2000-01-01 00:00:00,y_lag_1,y_lag_2,...,sell_price_lag_14,sell_price_lag_28,sell_price_mean_1_7,sell_price_mean_1_14,sell_price_mean_1_28,snap_CA,snap_TX,snap_WI,sell_price,y
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
HOBBIES_1_001_CA_1_evaluation,2013-07-13,0.25,0.545455,0.519231,0.25,0.4,0.833333,1.0,0.348721,0.0,0.0,...,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,1.0,0.0,0.309009,0
HOBBIES_1_001_CA_1_evaluation,2013-07-14,0.25,0.545455,0.519231,0.25,0.433333,1.0,1.0,0.349345,0.0,0.0,...,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,0.0,1.0,0.309009,0
HOBBIES_1_001_CA_1_evaluation,2013-07-15,0.25,0.545455,0.538462,0.5,0.466667,0.0,0.0,0.349969,0.0,0.0,...,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,1.0,1.0,0.309009,0
HOBBIES_1_001_CA_1_evaluation,2013-07-16,0.25,0.545455,0.538462,0.5,0.5,0.166667,0.0,0.350593,0.0,0.0,...,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,0.0,0.0,0.309009,0
HOBBIES_1_001_CA_1_evaluation,2013-07-17,0.25,0.545455,0.538462,0.5,0.533333,0.333333,0.0,0.351216,0.0,0.0,...,0.309009,0.309009,0.309009,0.306995,0.287148,0.0,0.0,0.0,0.309009,0


# Save training set and pipeline

In [27]:
import joblib

Dump our training features and target.

In [28]:
# Write the features together with the target to the training directory.
f_out = training_dir / "data.parquet"
df_result.to_parquet(path=f_out)

Dump our feature engineering pipeline; we'll need it at predict time when doing recursive forecasting.


In [29]:
# Persist the fitted feature engineering pipeline so it can be reloaded
# at predict time. The file name is a plain literal: it has no placeholders,
# so no f-string is needed.
f_out = pipeline_dir / "pipeline.joblib"
joblib.dump(pipeline, f_out)

['../artifacts/pipeline/pipeline.joblib']