In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from utils import plot_boxplot, get_seasonality_trend_overview, plot_period_mean, plot_periodogram
import warnings
import statsmodels.api as sm
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Display every column when rendering wide DataFrames instead of truncating them.
pd.set_option('display.max_columns', None)
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides pandas/statsmodels deprecation and runtime warnings — confirm that is intended.
warnings.filterwarnings('ignore')

### Time-based features

In [2]:
# Time Related Features
def create_date_features(df: pd.DataFrame):
    """Add calendar-derived feature columns to ``df`` and return it.

    The frame is modified in place; the same object is returned for chaining.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime-like ``date`` column (it is read through the
        ``.dt`` accessor).

    Returns
    -------
    pd.DataFrame
        ``df`` with these added columns: ``month``, ``day_of_month``,
        ``day_of_year``, ``week_of_month``, ``week_of_year``, ``day_of_week``,
        ``year``, ``is_wknd``, ``quarter``, ``is_month_start``,
        ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``,
        ``is_year_start``, ``is_year_end``, ``date_index`` and ``season``.
    """
    dates = df.date.dt

    df['month'] = dates.month.astype('int8')
    df['day_of_month'] = dates.day.astype('int8')
    df['day_of_year'] = dates.dayofyear.astype('int16')
    # Days 1-7 -> week 1, 8-14 -> week 2, ... (vectorised, no per-row apply).
    df['week_of_month'] = ((dates.day - 1) // 7 + 1).astype('int8')
    # `isocalendar()` returns a DataFrame; `week` is a column, not a method.
    df['week_of_year'] = dates.isocalendar().week.astype('int8')
    # 1 = Monday ... 7 = Sunday. Sales depend on the day of the week, so this
    # feature captures weekly seasonality.
    df['day_of_week'] = (dates.dayofweek + 1).astype('int8')
    df['year'] = dates.year.astype('int32')
    # weekday is 0 = Monday ... 6 = Sunday, so `// 4` flags Friday-Sunday.
    # NOTE(review): confirm that Friday is meant to count as weekend.
    df['is_wknd'] = (dates.weekday // 4).astype('int8')
    df['quarter'] = dates.quarter.astype('int8')
    df['is_month_start'] = dates.is_month_start.astype('int8')
    df['is_month_end'] = dates.is_month_end.astype('int8')
    df['is_quarter_start'] = dates.is_quarter_start.astype('int8')
    df['is_quarter_end'] = dates.is_quarter_end.astype('int8')
    df['is_year_start'] = dates.is_year_start.astype('int8')
    df['is_year_end'] = dates.is_year_end.astype('int8')
    # Integer code per distinct date, in order of first appearance.
    df["date_index"] = df.date.factorize()[0]

    # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    # Built as a plain ndarray so the assignment is positional; wrapping it in
    # a fresh pd.Series would align on a 0..n-1 index and yield NaN whenever
    # `df` carries a non-default index.
    month = df['month'].to_numpy()
    season = np.select(
        [np.isin(month, [12, 1, 2]), np.isin(month, [6, 7, 8]), np.isin(month, [9, 10, 11])],
        [0, 2, 3],
        default=1,
    )
    df['season'] = season.astype('int8')
    return df

### Work-related features

In [4]:
def create_work_related_features(df: pd.DataFrame):
    """Add ``workday`` and ``wageday`` flags to ``df`` and return it.

    The frame is modified in place and the ``IsWorkDay`` column is dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``holiday_national_binary``, ``holiday_local_binary``,
        ``holiday_regional_binary``, ``day_of_week``, ``IsWorkDay``,
        ``is_month_end`` and ``day_of_month``.

    Returns
    -------
    pd.DataFrame
        ``df`` with int8 columns ``workday`` and ``wageday`` added and
        ``IsWorkDay`` removed.
    """
    # A day is "off" when any holiday flag is set or day_of_week is 6 or 7.
    is_day_off = (
        (df.holiday_national_binary == 1)
        | (df.holiday_local_binary == 1)
        | (df.holiday_regional_binary == 1)
        | df['day_of_week'].isin([6, 7])
    )
    # Any non-null IsWorkDay entry forces the row to be a workday, overriding
    # the holiday/weekend rule above.
    is_workday = df.IsWorkDay.notnull() | ~is_day_off
    # Assign raw arrays (positional) rather than a freshly indexed pd.Series,
    # which would misalign and produce NaN for a non-default index.
    df['workday'] = is_workday.to_numpy().astype('int8')
    df.drop('IsWorkDay', axis=1, inplace=True)

    # Flagged on the last day of the month and on the 15th.
    is_wageday = (df['is_month_end'] == 1) | (df['day_of_month'] == 15)
    df['wageday'] = is_wageday.to_numpy().astype('int8')
    return df

### Moving Average features

In [5]:
def create_moving_average_features(
    df: pd.DataFrame,
    windows=(20, 30, 45, 60, 90, 120, 365, 730),
    lags=(16, 30, 60),
):
    """Return a sorted copy of ``df`` with lagged simple-moving-average columns.

    For every ``window`` and ``lag`` a column ``SMA{window}_sales_lag{lag}`` is
    added. It holds the rolling mean of ``sales`` over ``window`` rows, shifted
    by ``lag`` rows. Both steps are computed separately for each
    (``store_nbr``, ``family``) group, so values never cross from one series
    into the next.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``store_nbr``, ``family``, ``date`` and ``sales``.
        It is not modified.
    windows : iterable of int, optional
        Rolling window lengths in rows.
    lags : iterable of int, optional
        Row offsets applied to each rolling mean.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` sorted by ``store_nbr``, ``family``, ``date`` with the
        new columns appended. The original index labels are kept, so
        ``.sort_index()`` restores the input order.
    """
    group_keys = ["store_nbr", "family"]
    ordered = df.sort_values(group_keys + ["date"])
    # Plain arrays so the second groupby is positional and index-independent.
    key_arrays = [ordered[key].to_numpy() for key in group_keys]
    sales_by_series = ordered.groupby(group_keys, sort=False)["sales"]

    for window in windows:
        rolling_mean = sales_by_series.transform(
            lambda s, w=window: s.rolling(w).mean()
        )
        # Shift inside each group: a flat shift would push the tail of one
        # store/family series into the head of the next one.
        rolling_by_series = rolling_mean.groupby(key_arrays, sort=False)
        for lag in lags:
            ordered[f"SMA{window}_sales_lag{lag}"] = rolling_by_series.shift(lag).to_numpy()
    return ordered

### Lag features

In [6]:
#16, 17, 18, 19, 20, 30, 365, 730
def create_lag_features(ts: pd.DataFrame, lags: "int | Iterable[int]"):
    """Build lagged copies of ``ts``, one per requested lag.

    Parameters
    ----------
    ts : pd.Series or pd.DataFrame
        Time-ordered data; each lag is produced with ``ts.shift(lag)``.
    lags : int or iterable of int
        An integer ``n`` requests the contiguous lags ``1..n``. An iterable
        requests exactly the listed lags, e.g. ``[16, 30, 365]``.

    Returns
    -------
    pd.DataFrame
        Shifted copies concatenated along the columns under the keys
        ``y_lag_{lag}``. For a Series input these keys are the column names;
        for a DataFrame input they form the outer column level. When no lag
        is requested, an empty frame sharing ``ts.index`` is returned.
    """
    if isinstance(lags, (int, np.integer)):
        lag_steps = range(1, int(lags) + 1)
    else:
        lag_steps = [int(step) for step in lags]

    shifted = {f'y_lag_{step}': ts.shift(step) for step in lag_steps}
    if not shifted:
        # pd.concat raises on an empty mapping; return an empty, aligned frame.
        return pd.DataFrame(index=ts.index)
    return pd.concat(shifted, axis=1)

### Holiday features

In [71]:
from functools import cache

def days_to_holiday(date, holidays, from_holiday=True):
    """Return the gap between ``date`` and the closest holiday in one direction.

    Not memoised with ``functools.cache``: that decorator hashes its
    arguments and a pandas Series is unhashable, so every call with a Series
    of holidays raised ``TypeError``.

    Parameters
    ----------
    date : pd.Timestamp
        Reference date.
    holidays : pd.Series, sequence or scalar of datetimes
        Holiday dates.
    from_holiday : bool, default True
        ``True`` measures the time elapsed since the most recent holiday on or
        before ``date``; ``False`` measures the time remaining until the next
        holiday on or after ``date``.

    Returns
    -------
    pd.Timedelta
        Non-negative gap, or ``pd.NaT`` when no holiday lies in the requested
        direction.
    """
    holiday_dates = pd.Series(holidays)
    signed_gap = (date - holiday_dates) if from_holiday else (holiday_dates - date)
    # Keep only holidays on the requested side; taking the absolute value
    # would make both directions return the same number.
    gaps_in_direction = signed_gap[signed_gap >= pd.Timedelta(0)]
    if gaps_in_direction.empty:
        return pd.NaT
    return gaps_in_direction.min()


def create_holiday_features(df: pd.DataFrame, df_holidays: pd.DataFrame):
    """Add day counts to the next and from the previous holiday to ``df``.

    The frame is modified in place and returned.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime ``date`` column.
    df_holidays : pd.DataFrame
        Must contain a ``date`` column with the holiday dates.

    Returns
    -------
    pd.DataFrame
        ``df`` with ``day_to_nearest_holiday`` (days until the next holiday)
        and ``day_from_nearest_holiday`` (days since the previous holiday).
        Both are 0 on a holiday and NaN when no holiday exists in that
        direction.
    """
    holiday_dates = pd.to_datetime(df_holidays['date']).drop_duplicates()
    # Evaluate each distinct date once and broadcast back to all rows.
    distinct_dates = pd.DatetimeIndex(df['date'].drop_duplicates())

    def _days_per_row(from_holiday):
        # Gap per distinct date, looked up for every row and expressed in days.
        gaps = pd.Series(
            [
                days_to_holiday(day, holiday_dates, from_holiday=from_holiday)
                for day in distinct_dates
            ],
            index=distinct_dates,
            dtype='timedelta64[ns]',
        )
        return df['date'].map(gaps).dt.days.to_numpy()

    df['day_to_nearest_holiday'] = _days_per_row(from_holiday=False)
    df['day_from_nearest_holiday'] = _days_per_row(from_holiday=True)
    return df

### Zero forecasting

### Oil features

In [None]:
def create_oil_features(df: pd.DataFrame):
    """Add lagged copies of the ``avg_oil`` column to ``df`` in place.

    Creates ``oil_lags1`` .. ``oil_lags3``, each holding ``avg_oil`` shifted
    down by that many rows.

    NOTE(review): no ``return`` is visible in this cell; if the function ends
    here it returns ``None``, unlike the other ``create_*`` helpers — confirm.
    NOTE(review): the shift is row-based and not grouped; presumably ``df``
    has one row per date here — verify against the caller.
    """
    # Number of lagged columns to generate.
    n_lags = 3
    for l in range(1, n_lags + 1):
        # Row-wise lag: the first `l` rows of the new column are NaN.
        df[f'oil_lags{l}'] = df.avg_oil.shift(l)