In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
import warnings
warnings.filterwarnings("ignore")

sys.path.append('../src')

import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, cross_validate


from sales_project.metrics import mean_cv_scores, evaluate
from sales_project.models import ClippingRegressor, sequential_predictions
from sales_project.utils import save_predictions, save_pkl, save_dict_as_json, reduce_size

In [2]:
df = pd.read_csv('../data/artifacts/df_with_fe.csv', parse_dates=['date'])
reduce_size(df)

  0%|          | 0/64 [00:00<?, ?it/s]

In [3]:
target = "relative_sales"
num_cols = [
    "onpromotion",
    "dcoilwtico",
    "transactions",
    "scaled_dcoilwtico",
    "year",
    "dcoilwtico.lag.1",
    "transactions.lag.1",
    "onpromotion.lag.1",
    "relative_sales.lag.1",
    "relative_sales.rolling.mean.window.7.lag.1",
    "relative_sales.expanding.mean.lag.1",
    "dcoilwtico.lag.2",
    "transactions.lag.2",
    "onpromotion.lag.2",
    "relative_sales.lag.2",
    "relative_sales.rolling.mean.window.7.lag.2",
    "relative_sales.expanding.mean.lag.2",
    "dcoilwtico.lag.3",
    "transactions.lag.3",
    "onpromotion.lag.3",
    "relative_sales.lag.3",
    "relative_sales.rolling.mean.window.7.lag.3",
    "relative_sales.expanding.mean.lag.3",
    "dcoilwtico.lag.4",
    "transactions.lag.4",
    "onpromotion.lag.4",
    "relative_sales.lag.4",
    "relative_sales.rolling.mean.window.7.lag.4",
    "relative_sales.expanding.mean.lag.4",
    "dcoilwtico.lag.5",
    "transactions.lag.5",
    "onpromotion.lag.5",
    "relative_sales.lag.5",
    "relative_sales.rolling.mean.window.7.lag.5",
    "relative_sales.expanding.mean.lag.5",
    "dcoilwtico.lag.6",
    "transactions.lag.6",
    "onpromotion.lag.6",
    "relative_sales.lag.6",
    "relative_sales.rolling.mean.window.7.lag.6",
    "relative_sales.expanding.mean.lag.6",
    "dcoilwtico.lag.7",
    "transactions.lag.7",
    "onpromotion.lag.7",
    "relative_sales.lag.7",
    "relative_sales.rolling.mean.window.7.lag.7",
    "relative_sales.expanding.mean.lag.7",
]
cbe_cols = [
    "store_nbr",
    "family",
    "city",
    "state",
    "type",
    "cluster",
    "year",
    "month",
    "weekday",
]
ohe_cols = ["is_promoted", "type", "cluster", "year", "month", "weekday"]


# LightGBM. Kaggle RMSLE score: 0.49560

In [4]:
def init_pipeline(num_cols, cbe_cols, ohe_cols, min_value, max_value):

    preprocessor = ColumnTransformer(
        [
            ("num", "passthrough", num_cols),
            (
                "cbe",
                CatBoostEncoder(cols=cbe_cols),
                cbe_cols,
            ),
            (
                "ohe",
                OneHotEncoder(drop='if_binary'),
                ohe_cols,
            ),
        ],
        remainder="drop",
        verbose_feature_names_out=True,
    )

    model = ClippingRegressor(
        base_estimator=LGBMRegressor(
                verbose=-1,
                n_jobs=-1,
                random_state=42,
        ),
        min_value=min_value,
        max_value=max_value,
    )

    pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("model", model),
        ]
    )

    return pipeline

In [5]:
pipeline = init_pipeline(num_cols, cbe_cols, ohe_cols, min_value=1e-8, max_value=df[target].max())
pipeline.fit(df.query('subset != "submission"'), df.query('subset != "submission"')[target])
save_pkl(model=pipeline, path=Path('../models/feateng_lightgbm3.pkl'))

df = sequential_predictions(
    pipeline=pipeline,
    data=df,
    target_col=target,
    timestamp_col='date',
    start_date=df.query('subset == "submission"')['date'].min(),
    end_date=df.query('subset == "submission"')['date'].max(),
)

df['sales'] = df[target] * df['median_sales_over_family']
save_predictions(df.query('subset == "submission"'), filename='feateng_lightgbm3.csv')

Model file saved at: ../models/feateng_lightgbm3.pkl


  0%|          | 0/16 [00:00<?, ?it/s]

csv file saved at: ../data/predictions/feateng_lightgbm3.csv


In [5]:
pipeline = init_pipeline(num_cols, cbe_cols, ohe_cols, min_value=1e-8, max_value=df[target].max())
pipeline.fit(df.query('subset == "train"'), df.query('subset == "train"')[target])

y_true = df.query('subset == "test"')[target]
df = sequential_predictions(
    pipeline=pipeline,
    data=df,
    target_col=target,
    timestamp_col='date',
    start_date=df.query('subset == "test"')['date'].min(),
    end_date=df.query('subset == "test"')['date'].max(),
)

metrics = evaluate(y_true, df.query("subset == 'test'")[target])
save_dict_as_json(data=metrics, path=Path('../scores/test_feateng_lightgbm3.json'))
metrics

  0%|          | 0/92 [00:00<?, ?it/s]

JSON file saved at: ../scores/test_feateng_lightgbm3.json


{'MAE': 0.5402601455114616,
 'RMSE': 0.8528880947479247,
 'RMSLE': 0.31402093481445087,
 'R2': 0.658074558889086}

# XGBoost. Kaggle RMSLE score: 0.51744

In [10]:
df = pd.read_csv('../data/artifacts/df_with_fe.csv', parse_dates=['date'])
reduce_size(df)

In [7]:
def init_pipeline(num_cols, cbe_cols, ohe_cols, min_value, max_value):

    preprocessor = ColumnTransformer(
        [
            ("num", "passthrough", num_cols),
            (
                "cbe",
                CatBoostEncoder(cols=cbe_cols),
                cbe_cols,
            ),
            (
                "ohe",
                OneHotEncoder(drop='if_binary'),
                ohe_cols,
            ),
        ],
        remainder="drop",
        verbose_feature_names_out=True,
    )

    model = ClippingRegressor(
        XGBRegressor(
            verbosity=0,
            n_jobs=-1,
            random_state=42,
        ),
        min_value=min_value,
        max_value=max_value,
    )

    pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("model", model),
        ]
    )

    return pipeline

In [8]:
pipeline = init_pipeline(num_cols, cbe_cols, ohe_cols, min_value=1e-8, max_value=df[target].max())
pipeline.fit(df.query('subset != "submission"'), df.query('subset != "submission"')[target])
save_pkl(model=pipeline, path=Path('../models/feateng_xgboost3.pkl'))

df = sequential_predictions(
    pipeline=pipeline,
    data=df,
    target_col=target,
    timestamp_col='date',
    start_date=df.query('subset == "submission"')['date'].min(),
    end_date=df.query('subset == "submission"')['date'].max(),
)

df['sales'] = df[target] * df['median_sales_over_family']
save_predictions(df.query('subset == "submission"'), filename='feateng_xgboost3.csv')

Model file saved at: ../models/feateng_xgboost3.pkl


  0%|          | 0/16 [00:00<?, ?it/s]

csv file saved at: ../data/predictions/feateng_xgboost3.csv


In [9]:
pipeline = init_pipeline(num_cols, cbe_cols, ohe_cols, min_value=1e-8, max_value=df[target].max())
pipeline.fit(df.query('subset == "train"'), df.query('subset == "train"')[target])

y_true = df.query('subset == "test"')[target]
df = sequential_predictions(
    pipeline=pipeline,
    data=df,
    target_col=target,
    timestamp_col='date',
    start_date=df.query('subset == "test"')['date'].min(),
    end_date=df.query('subset == "test"')['date'].max(),
)

metrics = evaluate(y_true, df.query("subset == 'test'")[target])
save_dict_as_json(data=metrics, path=Path('../scores/test_feateng_xgboost3.json'))
metrics

  0%|          | 0/92 [00:00<?, ?it/s]

JSON file saved at: ../scores/test_feateng_xgboost3.json


{'MAE': 0.5837337374687195,
 'RMSE': 0.9633957147598267,
 'RMSLE': 0.31831592321395874,
 'R2': 0.5637285709381104}