In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [31]:
import sys
import warnings
# NOTE(review): this suppresses *every* warning for the whole session,
# including pandas' SettingWithCopyWarning and library deprecation notices.
# Consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

# Add ../src (resolved against the kernel's working directory) to the import
# path so that the `sales_project` package imported below can be found.
sys.path.append('../src')

import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_validate


from sales_project.models import ClippingRegressor
from sales_project.metrics import mean_cv_scores, evaluate
from sales_project.utils import save_predictions, save_pkl, save_dict_as_json

In [27]:
# Load the cleaned dataset and split it on the `is_submission` flag:
# rows to predict for the submission vs. the remaining rows used for
# fitting and evaluation.
df = pd.read_csv('../data/artifacts/cleaned_data.csv', parse_dates=['date'])

# .copy() makes df_submission an independent frame. A 'sales' column is
# assigned to it further down; doing that on a bare filtered result is a
# chained assignment (SettingWithCopyWarning, which the global warnings
# filter would otherwise hide).
df_submission = df.query('is_submission == True').copy()
df = df.query('is_submission == False')

# LightGBM. Kaggle RMSLE score: 0.67209

In [29]:
# Column configuration shared by the pipelines in this notebook.

# Name of the regression target column.
target = "relative_sales"

# Numeric feature columns.
num_cols = [
    "transactions",
    "dcoilwtico",
    "onpromotion",
]

# Categorical feature columns.
cat_cols = [
    "store_nbr",
    "family",
    "city",
    "state",
    "type",
    "cluster",
]

In [32]:
def init_pipeline(num_cols, cat_cols, min_value, max_value, base_estimator=None):
    """Build an unfitted preprocessing + regression pipeline.

    Parameters
    ----------
    num_cols : list of str
        Columns forwarded to the model unchanged ("passthrough").
    cat_cols : list of str
        Columns encoded with ``CatBoostEncoder``.
    min_value, max_value : float
        Bounds handed to ``ClippingRegressor``.
        NOTE(review): presumably the range predictions are clipped to --
        confirm in ``sales_project.models``.
    base_estimator : estimator, optional
        Regressor wrapped by ``ClippingRegressor``. When omitted, a
        ``LGBMRegressor(verbose=-1, n_jobs=-1, random_state=42)`` is used,
        which is the original behaviour of this function. Passing another
        regressor avoids having to copy this whole function per model.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"preprocessor"`` and ``"model"``.
    """
    # Build the default lazily so every call gets its own estimator instance.
    if base_estimator is None:
        base_estimator = LGBMRegressor(
            verbose=-1,
            n_jobs=-1,
            random_state=42,
        )

    # Only num_cols and cat_cols reach the model; every other column in the
    # input frame (including the target, if present) is dropped.
    preprocessor = ColumnTransformer(
        [
            ("num", "passthrough", num_cols),
            (
                "non-binary",
                CatBoostEncoder(cols=cat_cols),
                cat_cols,
            ),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )

    model = ClippingRegressor(
        base_estimator=base_estimator,
        min_value=min_value,
        max_value=max_value,
    )

    pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("model", model),
        ]
    )

    return pipeline

In [5]:
# Fit on all non-submission rows and persist the fitted pipeline.
pipeline = init_pipeline(num_cols=num_cols, cat_cols=cat_cols, min_value=1e-8, max_value=df[target].max())
pipeline.fit(df, df[target])
save_pkl(model=pipeline, path=Path('../models/baseline_lightgbm4.pkl'))

# The model predicts the `relative_sales` target; the prediction is multiplied
# by `median_sales_over_family` to obtain the `sales` column.
# NOTE(review): presumably this converts back to absolute sales -- confirm
# against how `relative_sales` is built in the cleaning step.
# .assign() returns a new frame instead of writing into a filtered frame in
# place, so the cell is idempotent and free of chained-assignment warnings.
df_submission = df_submission.assign(
    sales=pipeline.predict(df_submission) * df_submission['median_sales_over_family']
)
save_predictions(df_submission, filename='baseline_lightgbm4.csv')

Model file saved at: ../models/baseline_lightgbm4.pkl
csv file saved at: ../data/predictions/baseline_lightgbm4.csv


In [7]:
# Hold-out evaluation: fit on the 'train' subset, score on the 'test' subset.
train_df = df.query("subset == 'train'")
test_df = df.query("subset == 'test'")

# The upper bound is taken from the training targets only. Using
# df[target].max() here would include the 'test' rows and leak held-out
# label information into the fitted model.
pipeline = init_pipeline(num_cols=num_cols, cat_cols=cat_cols, min_value=1e-8, max_value=train_df[target].max())
pipeline.fit(train_df, train_df[target])

y_pred = pipeline.predict(test_df)
metrics = evaluate(test_df[target], y_pred)
save_dict_as_json(data=metrics, path=Path('../scores/test_baseline_lightgbm4.json'))
metrics

JSON file saved at: ../scores/test_baseline_lightgbm4.json


{'MAE': 0.606148470768601,
 'RMSE': 0.943270239761125,
 'RMSLE': 0.3243373281527902,
 'R2': 0.5817657581143305}

In [39]:
# Time-series cross-validation on the 'train' subset: 5 folds, each validation
# fold holding 10% of the training rows.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows are already in chronological order -- confirm for cleaned_data.csv.
train_df = df.query("subset == 'train'")
fold_size = int(0.1 * len(train_df))
tscv = TimeSeriesSplit(n_splits=5, test_size=fold_size)

scorers = [
    'neg_mean_absolute_error',
    'neg_root_mean_squared_error',
    'neg_root_mean_squared_log_error',
    'r2',
]

# `pipeline` is whatever the previous cell left bound; cross_validate fits a
# clone per fold, so the fitted object itself is not modified.
cv_raw = cross_validate(
    pipeline,
    train_df,
    train_df[target],
    cv=tscv,
    n_jobs=1,
    scoring=scorers,
    verbose=2,
)
cv_res = mean_cv_scores(cv_raw)
save_dict_as_json(data=cv_res, path=Path('../scores/cv_baseline_lightgbm4.json'))
cv_res

[CV] END .................................................... total time=   9.5s
[CV] END .................................................... total time=  11.5s
[CV] END .................................................... total time=  13.6s
[CV] END .................................................... total time=  11.5s
[CV] END .................................................... total time=  12.4s
JSON file saved at: ../scores/cv_baseline_lightgbm4.json


{'fit_time': 11.3605,
 'score_time': 0.3527,
 'test_mean_absolute_error': 0.5822,
 'test_root_mean_squared_error': 0.9468,
 'test_root_mean_squared_log_error': 0.3464,
 'test_r2': 0.5496}

# XGBoost. Kaggle RMSLE score: 0.64114

In [41]:
def init_pipeline(num_cols, cat_cols, min_value, max_value):
    """Build an unfitted preprocessing + XGBoost regression pipeline.

    NOTE: this redefinition shadows the LightGBM ``init_pipeline`` defined
    earlier in the notebook; cells run after it get the XGBoost variant.

    Parameters
    ----------
    num_cols : list of str
        Columns forwarded to the model unchanged ("passthrough").
    cat_cols : list of str
        Columns encoded with ``CatBoostEncoder``.
    min_value, max_value : float
        Bounds handed to ``ClippingRegressor``.
        NOTE(review): presumably the range predictions are clipped to --
        confirm in ``sales_project.models``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"preprocessor"`` and ``"model"``.
    """
    # Columns not listed in either group are dropped before modelling.
    transformers = [
        ("num", "passthrough", num_cols),
        ("non-binary", CatBoostEncoder(cols=cat_cols), cat_cols),
    ]
    preprocessor = ColumnTransformer(
        transformers,
        remainder="drop",
        verbose_feature_names_out=False,
    )

    regressor = XGBRegressor(verbosity=0, n_jobs=-1, random_state=42)
    model = ClippingRegressor(regressor, min_value=min_value, max_value=max_value)

    steps = [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
    return Pipeline(steps)

In [7]:
# Fit on all non-submission rows and persist the fitted pipeline.
pipeline = init_pipeline(num_cols=num_cols, cat_cols=cat_cols, min_value=1e-8, max_value=df[target].max())
pipeline.fit(df, df[target])
save_pkl(model=pipeline, path=Path('../models/baseline_xgboost4.pkl'))

# The model predicts the `relative_sales` target; the prediction is multiplied
# by `median_sales_over_family` to obtain the `sales` column.
# NOTE(review): presumably this converts back to absolute sales -- confirm
# against how `relative_sales` is built in the cleaning step.
# .assign() returns a new frame instead of writing into a filtered frame in
# place, so the cell is idempotent and free of chained-assignment warnings.
df_submission = df_submission.assign(
    sales=pipeline.predict(df_submission) * df_submission['median_sales_over_family']
)
save_predictions(df_submission, filename='baseline_xgboost4.csv')

Model file saved at: ../models/baseline_xgboost4.pkl
csv file saved at: ../data/predictions/baseline_xgboost4.csv


In [8]:
# Hold-out evaluation: fit on the 'train' subset, score on the 'test' subset.
train_df = df.query("subset == 'train'")
test_df = df.query("subset == 'test'")

# The upper bound is taken from the training targets only. Using
# df[target].max() here would include the 'test' rows and leak held-out
# label information into the fitted model.
pipeline = init_pipeline(num_cols=num_cols, cat_cols=cat_cols, min_value=1e-8, max_value=train_df[target].max())
pipeline.fit(train_df, train_df[target])

y_pred = pipeline.predict(test_df)
metrics = evaluate(test_df[target], y_pred)
save_dict_as_json(data=metrics, path=Path('../scores/test_baseline_xgboost4.json'))
metrics

JSON file saved at: ../scores/test_baseline_xgboost4.json


{'MAE': 0.5744371597651557,
 'RMSE': 0.8934911678901186,
 'RMSLE': 0.31138630013050395,
 'R2': 0.624743823493789}

In [43]:
# Time-series cross-validation on the 'train' subset: 5 folds, each validation
# fold holding 10% of the training rows.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows are already in chronological order -- confirm for cleaned_data.csv.
train_df = df.query("subset == 'train'")
fold_size = int(0.1 * len(train_df))
tscv = TimeSeriesSplit(n_splits=5, test_size=fold_size)

scorers = [
    'neg_mean_absolute_error',
    'neg_root_mean_squared_error',
    'neg_root_mean_squared_log_error',
    'r2',
]

# `pipeline` is whatever the previous cell left bound; cross_validate fits a
# clone per fold, so the fitted object itself is not modified.
cv_raw = cross_validate(
    pipeline,
    train_df,
    train_df[target],
    cv=tscv,
    n_jobs=1,
    scoring=scorers,
    verbose=2,
)
cv_res = mean_cv_scores(cv_raw)
save_dict_as_json(data=cv_res, path=Path('../scores/cv_baseline_xgboost4.json'))
cv_res

[CV] END .................................................... total time=   8.6s
[CV] END .................................................... total time=   7.7s
[CV] END .................................................... total time=   9.0s
[CV] END .................................................... total time=   9.9s
[CV] END .................................................... total time=  11.2s
JSON file saved at: ../scores/cv_baseline_xgboost4.json


{'fit_time': 8.9555,
 'score_time': 0.3054,
 'test_mean_absolute_error': 0.5606,
 'test_root_mean_squared_error': 0.9198,
 'test_root_mean_squared_log_error': 0.3385,
 'test_r2': 0.5738}