## Rossmann Store Sales

> Forecast sales using store, promotion, and competitor data

This is a Kaggle [competition](https://www.kaggle.com/c/rossmann-store-sales).

For the Rossmann dataset, we used the same preprocessing and data split as [Prokhorenkova et al., 2018a] – data from 2014
was used for training and validation, whereas 2015 was used for testing. We split 100k samples for validation from
the training dataset, and after the optimization of the hyperparameters, we retrained on the entire training dataset.

## Prokhorenkova et al., 2018a
Liudmila Prokhorenkova, Gleb Gusev, Aleksandr Vorobev, Anna Veronika Dorogush, and Andrey Gulin. CatBoost:
unbiased boosting with categorical features. 32nd Conference on Neural Information Processing Systems (NeurIPS),
2018a.

Repository with benchmarks - [rossmann-store-sales](https://github.com/catboost/benchmarks/tree/master/kaggle/rossmann-store-sales).

In [None]:
import calendar
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# This does not work on headless Linux systems.
from pandasgui import show

In [None]:
def print_nan_statistics(_name: str, _df: pd.DataFrame):
    """Print a one-line summary for every column of `_df` that contains missing values.

    Columns without missing values produce no output. The reported number of unique values
    comes from `Series.unique()`, which counts the missing value itself as one entry.

    Args:
        _name: Label identifying the data frame in the printed lines.
        _df: Data frame whose columns are inspected.
    """
    for _column in _df.columns:
        values = _df[_column]
        num_nans = values.isnull().sum()
        # Nothing to report for fully populated columns.
        if num_nans == 0:
            continue
        num_unique_values = len(values.unique())
        print(
            f"name={_name}, column={_column}, num_unique_values={num_unique_values}, num_nans={num_nans} dtype={values.dtype}"
        )

In [None]:
# Directory holding the locally cached competition files (`~` is expanded to the user's home directory).
data_dir = Path("~/.cache/kaggle/datasets/rossmann_store_sales").expanduser()
store_file = "store.csv.gz"
train_file = "train.csv.gz"

# Both files are passed to pandas as POSIX-style path strings.
train_path = (data_dir / train_file).as_posix()
store_path = (data_dir / store_file).as_posix()
train: pd.DataFrame = pd.read_csv(train_path)
store: pd.DataFrame = pd.read_csv(store_path)

# Quick overview of what was loaded: shapes, column names and the inferred column types of `train`.
print(f"train: shape={train.shape}, columns={train.columns.tolist()}")
print(f"store: shape={store.shape}, columns={store.columns.tolist()}")

print(train.dtypes)

In [None]:
# Abbreviated month names, January first; index 0 of `calendar.month_abbr` is an empty placeholder and is skipped.
# https://docs.python.org/3/library/calendar.html#calendar.month_abbr
# Position 8 (September, `Sep` by default) is overridden because the dataset spells it `Sept`.
month_abbrs = [
    "Sept" if position == 8 else abbr for position, abbr in enumerate(calendar.month_abbr[1:])
]

In [None]:
# StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays.
# Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
# The integer 0 is mapped to the string "n". The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `train["StateHoliday"]`: an inplace method on a column selection is chained
# assignment, which pandas warns about and which no longer updates `train` once Copy-on-Write is enabled.
# NOTE(review): only the integer 0 is replaced; if the column also holds the string "0", those values stay as
# a separate category — confirm whether that is intended.
train["StateHoliday"] = train["StateHoliday"].replace(0, "n")

In [None]:
# Break the Date column (e.g., 2015-07-31) on "-" into integer Year, Month and Day columns,
# then discard the original string column.
date_parts = train["Date"].str.split("-", n=3, expand=True)
train[["Year", "Month", "Day"]] = date_parts.astype(int)
train.drop(columns=["Date"], inplace=True)

In [None]:
# Report the columns with missing values in both raw tables.
for frame_name, frame in (("train", train), ("store", store)):
    print_nan_statistics(frame_name, frame)

# Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
# Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2

# Promo2SinceWeek and Promo2SinceYear are NaNs when Promo2 is 0 (0 = store is not participating)
# Do not care that much about PromoInterval - it will be removed later.
promo2_counts = store["Promo2"].value_counts()
print(promo2_counts.to_dict())

In [None]:
# Join with store table: attach the per-store attributes to every row of `train`.
# `DataFrame.join(other, on="Store")` matches `train["Store"]` against the *index* of `other`, not against a
# column of the same name. `store` comes straight from `read_csv` and therefore has a 0-based positional index,
# so joining it unchanged pairs each row with whatever record sits at position `Store` and leaves any store ID
# beyond the last position unmatched (all-NaN attributes). Indexing `store` by its own `Store` column makes the
# join match on the real store ID; it also means no duplicate `Store_right` column is created.
# NOTE(review): if the referenced benchmark preprocessing joins on the positional index, this deliberately
# departs from it — confirm whether exact parity with the benchmark is required.
train = train.join(store.set_index("Store"), on="Store", rsuffix="_right")

In [None]:
# Expand `PromoInterval` (e.g., Jan,Apr,Jul,Oct) into one 0/1 indicator column per month abbreviation.
# Rows with a missing interval get an empty month list, so all of their indicators are 0.
promo2_start_months = [[] if pd.isnull(interval) else interval.split(",") for interval in train["PromoInterval"]]
for month_abbr in month_abbrs:
    starts_in_month = [int(month_abbr in months) for months in promo2_start_months]
    train["Promo2Start_" + month_abbr] = np.array(starts_in_month)
# The raw string column is no longer needed once the indicators exist.
train.drop(columns=["PromoInterval"], inplace=True)

In [None]:
# Re-check which columns of `train` contain missing values at this point of the preprocessing.
print_nan_statistics("train", train)

In [None]:
# Open `train` in the pandasgui viewer for manual inspection.
show(train)

In [None]:
# Replace missing values with per-column sentinel values. One `DataFrame.fillna` call with a column -> value
# mapping is used instead of `train[col].fillna(..., inplace=True)`: an inplace method on a column selection is
# chained assignment, which pandas warns about and which no longer updates `train` once Copy-on-Write is enabled.
train = train.fillna(
    {
        # StoreType - differentiates between 4 different store models: a, b, c, d
        "StoreType": "na",
        # Assortment - describes an assortment level: a = basic, b = extra, c = extended
        "Assortment": "na",
        # CompetitionDistance - distance in meters to the nearest competitor store
        "CompetitionDistance": -1,
        "CompetitionOpenSinceMonth": 0,
        "CompetitionOpenSinceYear": 0,
        # Promo2 - Promo2 is a continuing and consecutive promotion for some stores:
        # 0 = store is not participating, 1 = store is participating
        "Promo2": 0,
        "Promo2SinceWeek": -1,
        "Promo2SinceYear": -1,
    }
)

# With the missing values gone, Promo2 can be stored as an integer flag.
train["Promo2"] = train["Promo2"].astype(int)

In [None]:
# Check whether any column of `train` still contains missing values (nothing is printed when none do).
print_nan_statistics("train", train)

In [None]:
# Preview the first five rows of `train`.
train.head(n=5)

In [None]:
# Split into train/test splits by calendar year: rows from 2014 form the train split, rows from 2015 the test split.
is_2014 = train["Year"] == 2014
is_2015 = train["Year"] == 2015

# Index labels of the selected rows (kept for reference).
train_indices = train.index[is_2014]
test_indices = train.index[is_2015]

# Rows are selected with the boolean masks through `.loc`. Passing the index *labels* to `.iloc` treats them as
# positions, which is only correct while `train` carries a default 0..n-1 index.
train_split = train.loc[is_2014].reset_index(drop=True)
test_split = train.loc[is_2015].reset_index(drop=True)

print(f"train_split={train_split.shape}, test_split={test_split.shape}")

In [None]:
# Open the train split in the pandasgui viewer for manual inspection.
show(train_split)

In [None]:
# Mean of the `Sales` column over the test split; the following cells divide by it.
m = test_split["Sales"].mean()

In [None]:
# Express 515.450 relative to the mean test-split sales.
# NOTE(review): 515.450 looks like an error metric (e.g. RMSE) obtained elsewhere — confirm its origin.
515.450 / m

In [None]:
# Difference between 643.804 and 515.450 once both are expressed relative to the mean test-split sales.
# NOTE(review): both constants look like error metrics (e.g. RMSE) obtained elsewhere — confirm their origin.
643.804 / m - 515.450 / m

In [None]:
# Integer-encode the `object` (string) columns. Each encoder learns its classes from the train split only and is
# then applied to both splits, so identical strings map to identical codes in train and test.
string_columns = ["StateHoliday", "StoreType", "Assortment"]

for column in string_columns:
    encoder = LabelEncoder().fit(train_split[column])
    train_split[column] = encoder.transform(train_split[column])
    test_split[column] = encoder.transform(test_split[column])

In [None]:
# Remove 'Year' column since it's irrelevant here: each split was selected on a single year, so the column
# is constant within a split.
for split_frame in (train_split, test_split):
    split_frame.drop(columns=["Year"], inplace=True)

In [None]:
# Open the train split in the pandasgui viewer again to inspect its current state.
show(train_split)

In [None]:
# Name of the target column.
label = "Sales"

In [None]:
from sklearn.dummy import DummyRegressor

from xtime.datasets import Dataset, DatasetMetadata, DatasetSplit
from xtime.estimators import Estimator
from xtime.ml import Feature, FeatureType, RegressionTask

# Feature schema as (column name, feature type) pairs.
feature_specs = [
    ("Store", FeatureType.NOMINAL),
    ("DayOfWeek", FeatureType.NOMINAL),
    ("Customers", FeatureType.CONTINUOUS),
    ("Open", FeatureType.BINARY),
    ("Promo", FeatureType.BINARY),
    ("StateHoliday", FeatureType.NOMINAL),
    ("SchoolHoliday", FeatureType.BINARY),
    ("Month", FeatureType.CONTINUOUS),
    ("Day", FeatureType.CONTINUOUS),
    ("StoreType", FeatureType.NOMINAL),
    ("Assortment", FeatureType.NOMINAL),
    ("CompetitionDistance", FeatureType.CONTINUOUS),
    ("CompetitionOpenSinceMonth", FeatureType.CONTINUOUS),
    ("CompetitionOpenSinceYear", FeatureType.CONTINUOUS),
    ("Promo2", FeatureType.BINARY),
    ("Promo2SinceWeek", FeatureType.CONTINUOUS),
    ("Promo2SinceYear", FeatureType.CONTINUOUS),
]
features = [Feature(feature_name, feature_type) for feature_name, feature_type in feature_specs]
# One binary indicator feature per month abbreviation (the `Promo2Start_*` columns).
features.extend(Feature(f"Promo2Start_{month_abbr}", FeatureType.BINARY) for month_abbr in month_abbrs)

dataset_metadata = DatasetMetadata(
    name="rossman_store_sales",
    version="NA",
    features=features,
    task=RegressionTask(),
)
# Every split separates the target column (`label`) from the remaining input columns.
# Note that `test_split` is registered under the "valid" key.
dataset_splits = {
    "train": DatasetSplit(x=train_split.drop(columns=label), y=train_split[label]),
    "valid": DatasetSplit(x=test_split.drop(columns=label), y=test_split[label]),
}
dataset = Dataset(metadata=dataset_metadata, splits=dataset_splits)

# Baseline: a regressor that always predicts the mean of the training targets.
estimator = Estimator()
train_data = dataset.splits["train"]
estimator.model = DummyRegressor(strategy="mean").fit(train_data.x, train_data.y)

metrics = estimator.evaluate(dataset)
print(metrics)

In [None]:
from sklearn.linear_model import LinearRegression

# Second baseline: ordinary least squares fitted on the train split.
estimator = Estimator()
train_data = dataset.splits["train"]
# `copy_X=False` allows scikit-learn to overwrite the input matrix instead of copying it.
linear_model = LinearRegression(copy_X=False)
estimator.model = linear_model.fit(train_data.x, train_data.y)

metrics = estimator.evaluate(dataset)
print(metrics)