# M5 Sales prediction [sklearn, lgbm]
Prediction of the products' demand for the next 28 days.
Reference: <https://www.kaggle.com/code/ragnar123/very-fst-model>

## Import libraries

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgbm
from sklearn import preprocessing, metrics

import giskard
from giskard import wrap_dataset, wrap_model

## Notebook settings

In [None]:
# Ignore every warning category globally. Note that this also hides
# deprecation warnings emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')

## Define constants

In [None]:
# Constants.
# Column that identifies one series; used as a melt id variable and as the
# group-by key when lag/rolling features are computed.
ID_COLUMN = "id"
# Name given to the melted sales values; this is the regression target.
TARGET_COLUMN = "demand"
# Rows dated on or before this day go to the training set, later rows to validation.
SPLIT_DATE = "2016-03-27"

# Paths.
# Directory (relative to the working directory) holding the CSV files read by load_data.
DATA_DIR = os.path.join(".", "datasets", "m5_sales_prediction")

## Load and preprocess data

In [None]:
def load_data(n_series_use=100):
    """Read the calendar, sell-price and sales CSV files from ``DATA_DIR``.

    The calendar and price tables are returned whole; the sales table is cut
    down to its first ``n_series_use`` rows. A short size report is printed
    for every table.

    Returns a ``(calendar, prices, sales)`` tuple of DataFrames.
    """
    def _read(file_name):
        # All input files live side by side in DATA_DIR.
        return pd.read_csv(os.path.join(DATA_DIR, file_name))

    def _report(label, frame):
        n_rows, n_cols = frame.shape
        print(f'{label} has {n_rows} rows and {n_cols} columns')

    print('Loading data...')

    calendar = _read("calendar.csv")
    _report('Calendar', calendar)

    prices = _read('sell_prices.csv')
    _report('Sell prices', prices)

    # Keep only the leading rows of the sales table to bound the data volume.
    sales = _read('sales_train_validation.csv').iloc[:n_series_use]
    _report('Sales train validation', sales)

    print("Data is loaded!")

    return calendar, prices, sales

In [None]:
def preprocess_data(_calendar_df, _prices_df, _sales_df):
    """Merge the calendar, price and sales tables into one long-format frame.

    Steps:
      1. Melt the wide sales table so every day column becomes a row whose
         value is stored under ``TARGET_COLUMN``.
      2. Left-join the calendar on the day label and the sell prices on
         ``(store_id, item_id, wm_yr_wk)``.
      3. Replace missing event names/types with ``'unknown'``.
      4. Label-encode the categorical columns (``ID_COLUMN`` is left as is).

    None of the input frames is modified.

    Returns the merged and encoded DataFrame.
    """
    print("Preprocessing data...")

    # Melt the sales data: translate columnar demand representation into single target vector.
    _data = pd.melt(_sales_df,
                    id_vars=[ID_COLUMN, 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                    var_name='day', value_name=TARGET_COLUMN)

    # Add the calendar data. The redundant date parts are dropped on a new
    # frame (not in place) so the caller's calendar table stays untouched.
    calendar = _calendar_df.drop(columns=['weekday', 'wday', 'month', 'year'])
    _data = pd.merge(_data, calendar, how='left', left_on=['day'], right_on=['d'])
    _data = _data.drop(columns=['d', 'day'])

    # Add the sell price data.
    _data = _data.merge(_prices_df, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    #TODO: Push below steps into preprocessing function of wrap_model.
    # Fill NaN values. Assigning the result back (instead of an in-place
    # fillna on a column selection) keeps this working under copy-on-write.
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    _data[nan_features] = _data[nan_features].fillna('unknown')

    # Encode categorical features, each with its own freshly fitted encoder.
    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
           'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        _data[feature] = preprocessing.LabelEncoder().fit_transform(_data[feature])

    print(f'Final dataset has {_data.shape[0]} rows and {_data.shape[1]} columns')
    print("Data preprocessed!")

    return _data

In [None]:
# Load the raw tables (with the default limit on sales rows) and merge them
# into a single long-format frame.
data = preprocess_data(*load_data())

## Enrich data with rolling features

In [None]:
def add_features(_data):
    """Add lagged/rolling demand, relative price and calendar features.

    ``_data`` is extended in place and also returned. Lags are positional
    within each ``ID_COLUMN`` group, so the rows of every group are assumed
    to already be in chronological order (TODO confirm against the caller).

    Added columns:
      * ``lag_t28/29/30`` - demand 28, 29 and 30 rows earlier.
      * ``rolling_*_t<N>`` - statistic of demand over the N rows ending
        28 rows before the current one.
      * ``price_change_t1`` / ``price_change_t365`` - relative difference
        between the current price and the previous price / the maximum
        price of the preceding 365 rows.
      * ``rolling_price_std_t7/30`` - price standard deviation over the last
        7 / 30 rows, current row included.
      * ``year``, ``month``, ``week``, ``day``, ``dayofweek`` - parts of
        ``date``, which is converted to datetime.
    """
    # Build the group-by objects once; every feature below reuses them.
    demand_by_id = _data.groupby([ID_COLUMN])[TARGET_COLUMN]
    price_by_id = _data.groupby([ID_COLUMN])['sell_price']

    # Add rolling demand features. Everything is shifted by 28 rows first, so
    # a window of size N covers rows [t - 28 - N + 1, t - 28] of the series.
    print("Producing rolling features...")
    for lag in (28, 29, 30):
        _data[f'lag_t{lag}'] = demand_by_id.shift(lag)

    _data['rolling_mean_t7'] = demand_by_id.transform(lambda x: x.shift(28).rolling(7).mean())
    _data['rolling_std_t7'] = demand_by_id.transform(lambda x: x.shift(28).rolling(7).std())

    _data['rolling_mean_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).mean())
    _data['rolling_std_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).std())
    _data['rolling_skew_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).skew())
    _data['rolling_kurt_t30'] = demand_by_id.transform(lambda x: x.shift(28).rolling(30).kurt())

    _data['rolling_mean_t90'] = demand_by_id.transform(lambda x: x.shift(28).rolling(90).mean())
    _data['rolling_mean_t180'] = demand_by_id.transform(lambda x: x.shift(28).rolling(180).mean())

    # Add price features. The helper series are kept as locals instead of
    # being written into the frame and dropped again afterwards.
    print("Producing price features...")
    previous_price = price_by_id.shift(1)
    _data['price_change_t1'] = (previous_price - _data['sell_price']) / previous_price

    max_price_t365 = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
    _data['price_change_t365'] = (max_price_t365 - _data['sell_price']) / max_price_t365

    _data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    _data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # Add time features.
    print("Producing time features...")
    _data['date'] = pd.to_datetime(_data['date'])
    _data['year'] = _data['date'].dt.year
    _data['month'] = _data['date'].dt.month
    # Series.dt.week was removed in pandas 2.0. isocalendar().week yields the
    # same ISO week number but as UInt32, so cast back to a plain int64.
    _data['week'] = _data['date'].dt.isocalendar().week.astype('int64')
    _data['day'] = _data['date'].dt.day
    _data['dayofweek'] = _data['date'].dt.dayofweek

    print("Features added!")

    return _data

In [None]:
# Extend the merged frame with the lag, rolling, price and calendar features.
data = add_features(data)

## Train-validation split

In [None]:
def train_val_split(_data):
    """Split ``_data`` into train and validation parts at ``SPLIT_DATE``.

    Rows dated on or before ``SPLIT_DATE`` form the training set; rows dated
    after it form the validation set (intended to be the final 28 days of
    every product).

    Returns ``(x_train, y_train, x_val, y_val)``. The ``x_*`` frames have the
    ``date`` column removed but still contain ``TARGET_COLUMN``; the ``y_*``
    series hold the target values. ``_data`` itself is not modified.
    """
    print("Splitting data...")

    # drop() returns new frames here, so no in-place mutation is performed on
    # boolean-mask slices of _data.
    # Train data.
    x_train = _data[_data['date'] <= SPLIT_DATE].drop(columns="date")
    y_train = x_train[TARGET_COLUMN]

    # Validation data.
    x_val = _data[_data['date'] > SPLIT_DATE].drop(columns="date")
    y_val = x_val[TARGET_COLUMN]

    print("Data was split!")
    print(f"Train samples: {len(x_train)}\n"
          f"Valid samples: {len(x_val)}")

    return x_train, y_train, x_val, y_val

In [None]:
# Split at SPLIT_DATE into train / validation features and targets.
X_train, Y_train, X_val, Y_val = train_val_split(data)

## Filter necessary features

In [None]:
def filter_features(_x):
    """Return only the model-input columns of ``_x``, in a fixed order.

    Columns not listed here are left out of the result. Selecting a column
    that ``_x`` does not contain raises ``KeyError``.
    """
    identifiers = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    calendar_parts = ['year', 'month', 'week', 'day', 'dayofweek']
    events = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    snap_and_price = ['snap_CA', 'snap_TX', 'snap_WI', 'sell_price']
    demand_lags = ['lag_t28', 'lag_t29', 'lag_t30']
    demand_rolling = [
        'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30',
        'rolling_mean_t90', 'rolling_mean_t180', 'rolling_std_t30',
    ]
    price_dynamics = [
        'price_change_t1', 'price_change_t365',
        'rolling_price_std_t7', 'rolling_price_std_t30',
    ]
    demand_shape = ['rolling_skew_t30', 'rolling_kurt_t30']

    # The concatenation order defines the column order of the result.
    selected = (
        identifiers + calendar_parts + events + snap_and_price
        + demand_lags + demand_rolling + price_dynamics + demand_shape
    )
    return _x[selected]

In [None]:
# Keep only the columns the model is trained on; the id and target columns
# are not in that list and are therefore removed here.
X_train = filter_features(X_train)
X_val = filter_features(X_val)

## Build and fit estimator

In [None]:
def build_model(x_train, y_train, x_val, y_val):
    """Fit a LightGBM regressor and print its RMSE on the validation split.

    Args:
        x_train: training feature frame.
        y_train: training target values.
        x_val: validation feature frame, used only for scoring.
        y_val: validation target values, used only for scoring.

    Returns the fitted ``LGBMRegressor``.
    """
    _ESTIMATOR_PARAMS = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10,
        'colsample_bytree': 0.75,
        'n_estimators': 200
    }

    # Fit estimator.
    print("Model training...")
    estimator = lgbm.LGBMRegressor(**_ESTIMATOR_PARAMS)
    estimator.fit(x_train, y_train)

    # Validate estimator.
    print("Model validation...")
    val_pred = estimator.predict(x_val)
    # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the
    # score is unchanged, but the documented order is used.
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    print(f'Validation RMSE-score: {val_score}')

    return estimator

In [None]:
# Train the regressor on the training split and print its validation RMSE.
model = build_model(X_train, Y_train, X_val, Y_val)

## Wrap data and estimator

In [None]:
# Wrap dataset.
# Every integer-typed validation column is declared categorical.
# NOTE(review): which columns that covers depends on the dtypes produced upstream - confirm.
categoricals = X_val.select_dtypes(int).columns.tolist()
# Re-attach the target so that features and target live in one frame.
raw_dataset = pd.concat([X_val, Y_val], axis=1)
wrapped_dataset = wrap_dataset(raw_dataset,
                               name="m5_products_timeseries_dataset",
                               target=TARGET_COLUMN,
                               cat_columns=categoricals)

In [None]:
# Wrap model.
wrapped_model = wrap_model(model,
                           model_type="regression",
                           name="m5_timeseries_regressor",
                           feature_names=X_val.columns)

## Scan model

In [None]:
# Run the giskard scan of the wrapped model against the wrapped validation dataset.
scanning_results = giskard.scan(wrapped_model, wrapped_dataset)

In [None]:
# Show the scan report. `display` is not imported in this file; presumably it
# is the builtin provided by the IPython/Jupyter session.
display(scanning_results)