DSCS_25_HW3

Provide a forecast for each product_rk/store_location_rk/week triple. Forecast horizon: 02Dec2019, 09Dec2019, 16Dec2019, 23Dec2019, 30Dec2019.

File descriptions

    train.csv - the training set
    test.csv - the test set
    sample_submission.csv - a sample submission file in the correct format
    store_location.csv - attributes of stores (data is obfuscated)

Data fields

    product_rk - id of a product; there are only 6 different products in the data set
    store_location_rk - id of a store
    period_start_dt - date (a Monday date of a week)
    demand - target variable (double); it is missing for the forecasting period (i.e. from 02Dec2019 to 30Dec2019)
    ... - the other fields in the train set carry demand-driver information

Evaluation

Submission file structure

id, predicted
908,1


Score metric is SMAPE.

In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
import pandas as pd
import numpy as np
from copy import deepcopy
from catboost import CatBoostRegressor, Pool

In [5]:
# Load the raw weekly data, name the unnamed first CSV column 'id', and parse
# the week-start column into datetimes.
all_data = (
    pd.read_csv('train.csv', sep=',')
    .rename(columns={'Unnamed: 0': 'id'})
    .assign(period_start_dt=lambda frame: pd.to_datetime(frame['period_start_dt']))
)

# Keys identifying one time-series observation (series keys + week).
TS_ID = ['product_rk', 'store_location_rk', 'period_start_dt']
# Same keys ordered so that the first two give the product/week aggregation level.
AGG_ID = ['product_rk', 'period_start_dt', 'store_location_rk']

def ts_fillna_optimized(df, col):
    """Impute missing values of ``col`` along each product/store time series.

    Steps, in order:
      1. sort by product, store and week;
      2. forward-fill, then backward-fill, strictly *within* each
         product/store series;
      3. fill whatever is still missing (series with no observed value at
         all) with the mean of ``col`` over all stores for the same
         product and week.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``TS_ID`` columns and ``col``.
    col : str
        Name of the column to impute.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` with ``col`` imputed. Values stay NaN only
        if the product/week mean is itself undefined.
    """
    df = df.sort_values(TS_ID)
    series_keys = TS_ID[:-1]

    # Both fills must be grouped. Chaining `.ffill().bfill()` on the grouped
    # object runs the back-fill on the plain Series, which copies values from
    # the *next* series into a series that has no observations of its own.
    df[col] = df.groupby(series_keys)[col].ffill()
    df[col] = df.groupby(series_keys)[col].bfill()

    # Fallback for series with no observed value: product/week mean.
    mean_val = df.groupby(AGG_ID[:2])[col].transform('mean')
    df[col] = df[col].fillna(mean_val)
    return df

# Impute price and authorization columns along each product/store series.
for col in ['PRICE_REGULAR', 'PRICE_AFTER_DISC', 'AUTORIZATION_FLAG']:
    all_data = ts_fillna_optimized(all_data, col)

# Store 309 is excluded from modelling (NOTE(review): reason not recorded here).
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
all_data = all_data[all_data['store_location_rk'] != 309].copy()

# A missing promo flag is treated as "no promo".
all_data['Promo'] = all_data['PROMO1_FLAG'].fillna(0)

# The ids are used as categorical features later, so store them as strings.
all_data['product_rk'] = all_data['product_rk'].astype(str)
all_data['store_location_rk'] = all_data['store_location_rk'].astype(str)

# Missing demand before the forecast horizon is set to zero; rows from
# 2019-12-02 onwards keep NaN demand because they are what must be predicted.
mask_history = all_data['period_start_dt'] < '2019-12-02'
all_data.loc[mask_history, 'demand'] = all_data.loc[mask_history, 'demand'].fillna(0)

In [6]:
# Mean demand of each product across all stores for the same week.
demand_by_week_product = all_data.groupby(['period_start_dt', 'product_rk'])['demand']
global_demand = demand_by_week_product.transform('mean')
all_data['global_prod_demand'] = global_demand

# Calendar features derived from the week-start date.
week_start_dates = all_data['period_start_dt'].dt
all_data['month'] = week_start_dates.month
all_data['week'] = week_start_dates.isocalendar().week.astype(int)
all_data['is_december'] = all_data['month'].eq(12).astype(int)
# ISO week 1 is used as the "New Year week" marker.
all_data['is_ny_week'] = all_data['week'].eq(1).astype(int)
# Note: this is 0 for ISO week 52, 51 for week 1 and -1 for an ISO week 53.
all_data['weeks_to_ny'] = 52 - all_data['week']

In [7]:
def get_features_for_date(df, current_date):
    """Build lag/rolling/price features and return the rows for one week.

    Only rows with ``period_start_dt <= current_date`` are used, so the
    features of ``current_date`` depend on earlier rows only. Lags are
    positional (``shift``) within each product/store series.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``product_rk``, ``store_location_rk``, ``period_start_dt``,
        ``demand``, ``global_prod_demand``, ``PRICE_AFTER_DISC`` and
        ``PRICE_REGULAR``.
    current_date : datetime-like
        Week to produce features for.

    Returns
    -------
    pandas.DataFrame
        Rows of ``current_date`` with the feature columns added.
    """
    series_keys = ['product_rk', 'store_location_rk']

    history = (
        df.loc[df['period_start_dt'] <= current_date]
        .copy()
        .sort_values(series_keys + ['period_start_dt'])
    )
    demand_by_series = history.groupby(series_keys)['demand']
    global_by_series = history.groupby(series_keys)['global_prod_demand']

    # Value 52 rows back in the same series.
    history['demand_lag52'] = demand_by_series.shift(52)
    history['global_demand_lag52'] = global_by_series.shift(52)

    # Short-term lags.
    for lag in (1, 2, 3, 4):
        history[f'demand_lag{lag}'] = demand_by_series.shift(lag)

    # The rolling window runs over the whole (sorted) column, not per group.
    # It does not mix series: the grouped shift(1) leaves a NaN on the first
    # row of every series, and a 4-row window containing a NaN yields NaN.
    lagged_demand = demand_by_series.shift(1)
    history['roll_mean_4w'] = lagged_demand.rolling(4).mean()

    # Relative discount; +0.01 keeps the denominator away from zero.
    regular_price = history['PRICE_REGULAR'] + 0.01
    history['discount_depth'] = 1 - (history['PRICE_AFTER_DISC'] / regular_price)

    return history.loc[history['period_start_dt'] == current_date]

In [8]:
# Training frame: the same lag/rolling/price features that
# get_features_for_date builds, computed once over the full history.
series_keys = ['product_rk', 'store_location_rk']
full_df = all_data.sort_values(series_keys + ['period_start_dt']).copy()

demand_by_series = full_df.groupby(series_keys)['demand']
global_by_series = full_df.groupby(series_keys)['global_prod_demand']

full_df['demand_lag52'] = demand_by_series.shift(52)
full_df['global_demand_lag52'] = global_by_series.shift(52)
for lag in (1, 2, 3, 4):
    full_df[f'demand_lag{lag}'] = demand_by_series.shift(lag)

# Rolling runs over the sorted column; the NaN that the grouped shift(1)
# leaves at each series start keeps windows from spanning two series.
lagged_demand = demand_by_series.shift(1)
full_df['roll_mean_4w'] = lagged_demand.rolling(4).mean()

regular_price = full_df['PRICE_REGULAR'] + 0.01
full_df['discount_depth'] = 1 - (full_df['PRICE_AFTER_DISC'] / regular_price)

# Remaining gaps (e.g. lags with no history) become 0.
full_df = full_df.fillna(0)

# Split: train before val_start, validate on [val_start, test_start).
val_start = '2019-11-01'
test_start = '2019-12-02'

week_start = full_df['period_start_dt']
train_df = full_df[week_start < val_start]
val_df = full_df[(week_start >= val_start) & (week_start < test_start)]

# Model inputs; the two id columns are passed to CatBoost as categorical.
cat_features = ['product_rk', 'store_location_rk']
calendar_features = ['month', 'week', 'is_december', 'is_ny_week', 'weeks_to_ny']
price_features = ['PRICE_REGULAR', 'PRICE_AFTER_DISC', 'discount_depth', 'Promo']
lag_features = [
    'demand_lag52', 'global_demand_lag52',
    'demand_lag1', 'demand_lag2', 'demand_lag3', 'demand_lag4',
    'roll_mean_4w',
]
features = cat_features + calendar_features + price_features + lag_features

cat_idxs = list(map(features.index, cat_features))

In [10]:
# Train one CatBoost model per seed on log1p(demand); predictions are averaged
# later. The validation frame is used as the early-stopping eval set.
seeds = [42, 777, 2024]

catboost_params = dict(
    iterations=2000,
    learning_rate=0.02,
    depth=6,
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=150,
    verbose=200,
    l2_leaf_reg=3,
)

X_train, y_train_log = train_df[features], np.log1p(train_df['demand'])
X_val, y_val_log = val_df[features], np.log1p(val_df['demand'])

models = []
for seed in seeds:
    model = CatBoostRegressor(random_seed=seed, **catboost_params)
    model.fit(
        X_train, y_train_log,
        eval_set=(X_val, y_val_log),
        cat_features=cat_idxs,
    )
    models.append(model)

0:	learn: 0.9104054	test: 0.8592709	best: 0.8592709 (0)	total: 35.3ms	remaining: 1m 10s
200:	learn: 0.5291639	test: 0.4401973	best: 0.4401973 (200)	total: 5.98s	remaining: 53.5s
400:	learn: 0.5060673	test: 0.4392049	best: 0.4367458 (341)	total: 11.4s	remaining: 45.6s
600:	learn: 0.4854319	test: 0.4311805	best: 0.4310183 (589)	total: 16.7s	remaining: 39s
800:	learn: 0.4716476	test: 0.4229929	best: 0.4228055 (795)	total: 23.5s	remaining: 35.1s
1000:	learn: 0.4588433	test: 0.4201283	best: 0.4199357 (932)	total: 28.7s	remaining: 28.6s
1200:	learn: 0.4479621	test: 0.4187032	best: 0.4180940 (1078)	total: 35.2s	remaining: 23.4s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.4180940419
bestIteration = 1078

Shrink model to first 1079 iterations.
0:	learn: 0.9111810	test: 0.8592944	best: 0.8592944 (0)	total: 32.2ms	remaining: 1m 4s
200:	learn: 0.5285093	test: 0.4388911	best: 0.4388911 (200)	total: 5.17s	remaining: 46.2s
400:	learn: 0.5065316	test: 0.4348216	best: 0.4348216

In [12]:
# Average the log-space predictions of all models on the validation frame.
X_val_features = val_df[features]
val_preds_log = np.zeros(len(val_df))
for fitted_model in models:
    val_preds_log = val_preds_log + fitted_model.predict(X_val_features) / len(models)

# Back to the demand scale, clipped at zero.
val_preds = np.maximum(np.expm1(val_preds_log), 0)

# Asymmetric rounding: round up only when the fractional part exceeds the
# threshold (value chosen by grid search, per the author's note).
threshold = 0.6
val_whole_units = np.floor(val_preds)
val_round_up = (val_preds % 1 > threshold).astype(int)
val_preds_rounded = val_whole_units + val_round_up

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |pred - true| / (|true| + |pred| + 1e-10)``; the small
    constant makes a pair of zeros contribute 0 instead of dividing by zero.
    Both arguments are expected to be array-like objects that support
    element-wise arithmetic (e.g. NumPy arrays) and have the same length.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_true) + np.abs(y_pred) + 1e-10
    return 100/len(y_true) * np.sum(2 * abs_error / scale)

# Validation SMAPE of the rounded ensemble forecast. The validation features
# use observed demand lags, whereas the final forecast is recursive and feeds
# predicted demand back in, so this score may be optimistic for later weeks.
print(f"SMAPE: {smape(val_df['demand'].values, val_preds_rounded):.4f}")

SMAPE: 50.2509


In [17]:
def get_zero_window_flags(df, current_date, window_weeks=8):
    """Flag product/store series that sold nothing in the trailing window.

    The window is ``[current_date - window_weeks, current_date)``. A series
    gets ``zero_window_flag == 1`` when its demand in the window sums to
    exactly 0; the caller uses the flag to force the forecast to 0.

    A series whose demand in the window is entirely missing is *not*
    flagged: unknown demand is not evidence of zero demand, so the sum is
    taken with ``min_count=1`` and stays NaN in that case.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``product_rk``, ``store_location_rk``, ``period_start_dt``
        and ``demand``.
    current_date : datetime-like
        First week after the window. Anything ``pandas.Timestamp`` accepts
        (Timestamp, ``numpy.datetime64``, ISO date string).
    window_weeks : int, default 8
        Length of the look-back window in weeks.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_rk``, ``store_location_rk``, ``zero_window_flag``
        with one row per series that has at least one row in the window.
    """
    # Normalise so that the date arithmetic works for any datetime-like input.
    current_date = pd.Timestamp(current_date)
    start_date = current_date - pd.DateOffset(weeks=window_weeks)

    in_window = (
        (df['period_start_dt'] < current_date) &
        (df['period_start_dt'] >= start_date)
    )

    window_demand = (
        df.loc[in_window]
        .groupby(['product_rk', 'store_location_rk'])['demand']
        .sum(min_count=1)  # all-NaN window -> NaN, not 0
        .reset_index()
    )
    window_demand['zero_window_flag'] = (window_demand['demand'] == 0).astype(int)

    return window_demand.drop(columns=['demand'])

In [15]:
# Recursive multi-step forecast: predict the horizon one week at a time and
# write each week's prediction back as 'demand', so that the next week's lag
# features are built from it.
forecast_df = all_data.copy()
is_future_week = all_data['period_start_dt'] >= test_start
forecast_dates = sorted(all_data.loc[is_future_week, 'period_start_dt'].unique())

# Keep 53 weeks of history before the horizon so the 52-row lag is available.
start_history = pd.to_datetime(test_start) - pd.DateOffset(weeks=53)
recursive_df = forecast_df.loc[forecast_df['period_start_dt'] >= start_history].copy()

observation_keys = ['product_rk', 'store_location_rk', 'period_start_dt']

for current_date in forecast_dates:
    step_df = get_features_for_date(recursive_df, current_date).fillna(0)

    # Attach the "no sales in the last 8 weeks" flag; series without any row
    # in the window get 0.
    step_df = step_df.merge(
        get_zero_window_flags(recursive_df, current_date, window_weeks=8),
        on=['product_rk', 'store_location_rk'],
        how='left',
    )
    step_df['zero_window_flag'] = step_df['zero_window_flag'].fillna(0)

    X_test = step_df[features]
    if len(X_test) == 0:
        continue

    # Ensemble average in log space, then back to the demand scale.
    preds_log = np.zeros(len(X_test))
    for fitted_model in models:
        preds_log += fitted_model.predict(X_test) / len(models)
    preds = np.maximum(np.expm1(preds_log), 0)

    # Same threshold rounding as on validation.
    preds = np.floor(preds) + (preds % 1 > threshold).astype(int)

    # Series flagged as having zero demand in the window are forced to 0.
    preds[step_df['zero_window_flag'].values == 1] = 0

    updates = step_df[observation_keys].copy()
    updates['demand'] = preds

    # Write the predictions into the working history, aligned on the keys.
    recursive_df = recursive_df.set_index(observation_keys)
    recursive_df.update(updates.set_index(observation_keys))
    recursive_df = recursive_df.reset_index()

In [16]:
# Export the forecast-horizon rows as (id, predicted) with integer predictions.
is_forecast_week = recursive_df['period_start_dt'] >= test_start
submission = (
    recursive_df.loc[is_forecast_week, ['id', 'demand']]
    .rename(columns={'demand': 'predicted'})
    .astype({'predicted': int})
)
submission.to_csv('submission.csv', index=False)