In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import warnings
warnings.filterwarnings("ignore")

sys.path.append('../src')

import pandas as pd
from pathlib import Path
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_validate

from sales_project.models import ClippingRegressor
from sales_project.metrics import mean_cv_scores, evaluate
from sales_project.utils import save_predictions, save_pkl, save_dict_as_json

In [3]:
# Load the cleaned dataset: `id` becomes the index and `date` is parsed as datetime.
df = pd.read_csv('../data/artifacts/cleaned_data.csv', index_col='id', parse_dates=['date'])

# Rows flagged `is_submission` are the ones predictions are written for.
# Take an explicit copy: a `sales` column is assigned to this frame further down,
# and writing into a frame derived by filtering is chained assignment
# (pandas' SettingWithCopyWarning, which the global warnings filter hides).
df_submission = df.query('is_submission == True').copy()

# Everything else is the labelled data used for fitting and evaluation.
df = df.query('is_submission == False')

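# LightGBM. Kaggle RMSLE score: 0.69694

The Kaggle score is measured on the submitted `sales` values, whereas the local test and CV metrics below are measured on `relative_sales`, so the two RMSLE figures are not directly comparable.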

In [4]:
# Column the models are trained to predict.
target = "relative_sales"

# Numeric feature columns.
num_cols = [
    "transactions",
    "dcoilwtico",
    "onpromotion",
]

# Categorical feature columns.
cat_cols = [
    "store_nbr",
    "family",
    "city",
    "state",
    "type",
    "cluster",
]

In [5]:
def init_pipeline(df, num_cols, cat_cols, min_value, max_value):
    """Assemble an unfitted preprocessing + LightGBM pipeline.

    The preprocessing step passes ``num_cols`` through unchanged, encodes
    ``cat_cols`` with ``CatBoostEncoder`` and drops every other column.
    The regressor is an ``LGBMRegressor`` (seed 42) wrapped in
    ``ClippingRegressor``.

    Parameters
    ----------
    df : unused by this function; kept so existing call sites stay valid.
    num_cols : list of numeric column names to pass through.
    cat_cols : list of categorical column names to encode.
    min_value, max_value : forwarded to ``ClippingRegressor``
        (presumably the bounds predictions are clipped to -- confirm in
        ``sales_project.models``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps named ``"preprocessor"`` and ``"model"``.
    """
    lgbm = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
    clipped_regressor = ClippingRegressor(
        base_estimator=lgbm,
        min_value=min_value,
        max_value=max_value,
    )

    transformers = [
        ("num", "passthrough", num_cols),
        ("non-binary", CatBoostEncoder(cols=cat_cols), cat_cols),
    ]
    feature_transformer = ColumnTransformer(
        transformers,
        remainder="drop",
        verbose_feature_names_out=False,
    )

    return Pipeline(
        steps=[
            ("preprocessor", feature_transformer),
            ("model", clipped_regressor),
        ]
    )

In [8]:
# Fit on every labelled row and persist the fitted pipeline.
pipeline = init_pipeline(
    df,
    num_cols=num_cols,
    cat_cols=cat_cols,
    min_value=1e-8,
    max_value=df[target].max(),
)
pipeline.fit(df, df[target])
save_pkl(model=pipeline, path=Path('../models/baseline_lightgbm.pkl'))

# The model predicts `relative_sales`; multiply by the `median_sales_over_family`
# column to obtain the `sales` values that are written out.
relative_pred = pipeline.predict(df_submission)
df_submission['sales'] = relative_pred * df_submission['median_sales_over_family']
save_predictions(df_submission, filename='baseline_lightgbm3.csv')

Model file saved at: ../models/baseline_lightgbm.pkl
csv file saved at: ../data/predictions/baseline_lightgbm_v3.csv


In [6]:
# Fit a fresh pipeline on the `train` subset only, for the hold-out evaluation.
df_train = df.query("subset == 'train'")
pipeline = init_pipeline(
    df_train,
    num_cols=num_cols,
    cat_cols=cat_cols,
    min_value=1e-8,
    # Derive the upper clip bound from training targets only, so rows outside
    # the training subset cannot influence the model that is evaluated on them.
    max_value=df_train[target].max(),
)
pipeline.fit(df_train, df_train[target])

In [8]:
# Score the current pipeline on the `test` subset and store the metrics as JSON.
df_test = df.query("subset == 'test'")
y_pred = pipeline.predict(df_test)
metrics = evaluate(df_test[target], y_pred)
save_dict_as_json(data=metrics, path=Path('../scores/test_baseline_lightgbm3.json'))
metrics

JSON file saved at: ../scores/test_baseline_lightgbm3.json


{'MAE': 0.6629876241649449,
 'RMSE': 0.8874803658515329,
 'RMSLE': 0.4529360834733795,
 'R2': 0.2046882260814169}

In [9]:
# 5-fold cross-validation of the pipeline on the `train` subset; the averaged
# scores are stored as JSON and displayed.
# NOTE(review): the rows carry a parsed `date` column and shuffled KFold ignores
# temporal order -- confirm that this is the intended validation scheme.
cv_train = df.query("subset == 'train'")
cv_scoring = [
    'neg_mean_absolute_error',
    'neg_root_mean_squared_error',
    'neg_root_mean_squared_log_error',
    'r2',
]
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

cv_res = cross_validate(
    estimator=pipeline,
    X=cv_train,
    y=cv_train[target],
    scoring=cv_scoring,
    cv=cv_strategy,
    n_jobs=1,
    verbose=2,
)
cv_res = mean_cv_scores(cv_res)
save_dict_as_json(data=cv_res, path=Path('../scores/cv_baseline_lightgbm3.json'))
cv_res

[CV] END .................................................... total time=  10.4s
[CV] END .................................................... total time=  10.2s
[CV] END .................................................... total time=   8.8s
[CV] END .................................................... total time=   9.8s
[CV] END .................................................... total time=   9.1s
JSON file saved at: ../scores/cv_baseline_lightgbm3.json


{'fit_time': 9.0642,
 'score_time': 0.5682,
 'test_mean_absolute_error': 0.6667,
 'test_root_mean_squared_error': 0.8905,
 'test_root_mean_squared_log_error': 0.4545,
 'test_r2': 0.1996}

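# XGBoost. Kaggle RMSLE score: 0.67744

The Kaggle score is measured on the submitted `sales` values, whereas the local test and CV metrics below are measured on `relative_sales`, so the two RMSLE figures are not directly comparable.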

In [11]:
def init_pipeline(df, num_cols, cat_cols, min_value, max_value):
    """Assemble an unfitted preprocessing + XGBoost pipeline.

    Redefines ``init_pipeline`` with an ``XGBRegressor`` (seed 42) as the
    wrapped estimator. The preprocessing step passes ``num_cols`` through
    unchanged, encodes ``cat_cols`` with ``CatBoostEncoder`` and drops every
    other column.

    Parameters
    ----------
    df : unused by this function; kept so existing call sites stay valid.
    num_cols : list of numeric column names to pass through.
    cat_cols : list of categorical column names to encode.
    min_value, max_value : forwarded to ``ClippingRegressor``
        (presumably the bounds predictions are clipped to -- confirm in
        ``sales_project.models``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps named ``"preprocessor"`` and ``"model"``.
    """
    column_steps = [
        ("num", "passthrough", num_cols),
        ("non-binary", CatBoostEncoder(cols=cat_cols), cat_cols),
    ]
    feature_transformer = ColumnTransformer(
        column_steps,
        verbose_feature_names_out=False,
        remainder="drop",
    )

    booster = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
    clipped_regressor = ClippingRegressor(
        booster,
        min_value=min_value,
        max_value=max_value,
    )

    steps = [
        ("preprocessor", feature_transformer),
        ("model", clipped_regressor),
    ]
    return Pipeline(steps)

In [12]:
# Fit on every labelled row and persist the fitted pipeline.
pipeline = init_pipeline(
    df,
    num_cols=num_cols,
    cat_cols=cat_cols,
    min_value=1e-8,
    max_value=df[target].max(),
)
pipeline.fit(df, df[target])
save_pkl(model=pipeline, path=Path('../models/baseline_xgboost.pkl'))

# The model predicts `relative_sales`; multiply by the `median_sales_over_family`
# column to obtain the `sales` values that are written out.
relative_pred = pipeline.predict(df_submission)
df_submission['sales'] = relative_pred * df_submission['median_sales_over_family']
save_predictions(df_submission, filename='baseline_xgboost3.csv')

Model file saved at: ../models/baseline_xgboost.pkl
csv file saved at: ../data/predictions/baseline_xgboost_v3.csv


In [14]:
# Fit a fresh pipeline on the `train` subset only, for the hold-out evaluation.
df_train = df.query("subset == 'train'")
pipeline = init_pipeline(
    df_train,
    num_cols=num_cols,
    cat_cols=cat_cols,
    min_value=1e-8,
    # Derive the upper clip bound from training targets only, so rows outside
    # the training subset cannot influence the model that is evaluated on them.
    max_value=df_train[target].max(),
)
pipeline.fit(df_train, df_train[target])

In [15]:
# Score the current pipeline on the `test` subset and store the metrics as JSON.
df_test = df.query("subset == 'test'")
y_pred = pipeline.predict(df_test)
metrics = evaluate(df_test[target], y_pred)
save_dict_as_json(data=metrics, path=Path('../scores/test_baseline_xgboost3.json'))
metrics

JSON file saved at: ../scores/test_baseline_xgboost3.json


{'MAE': 0.6803333087321959,
 'RMSE': 0.9264458771450214,
 'RMSLE': 0.4634903140086174,
 'R2': 0.13331753407048186}

In [16]:
# 5-fold cross-validation of the pipeline on the `train` subset; the averaged
# scores are stored as JSON and displayed.
# NOTE(review): the rows carry a parsed `date` column and shuffled KFold ignores
# temporal order -- confirm that this is the intended validation scheme.
cv_train = df.query("subset == 'train'")
cv_scoring = [
    'neg_mean_absolute_error',
    'neg_root_mean_squared_error',
    'neg_root_mean_squared_log_error',
    'r2',
]
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

cv_res = cross_validate(
    estimator=pipeline,
    X=cv_train,
    y=cv_train[target],
    scoring=cv_scoring,
    cv=cv_strategy,
    n_jobs=1,
    verbose=2,
)
cv_res = mean_cv_scores(cv_res)
save_dict_as_json(data=cv_res, path=Path('../scores/cv_baseline_xgboost3.json'))
cv_res

[CV] END .................................................... total time=  10.1s
[CV] END .................................................... total time=   9.9s
[CV] END .................................................... total time=  10.0s
[CV] END .................................................... total time=   9.2s
[CV] END .................................................... total time=  10.0s
JSON file saved at: ../scores/cv_baseline_xgboost3.json


{'fit_time': 9.3403,
 'score_time': 0.5162,
 'test_mean_absolute_error': 0.6785,
 'test_root_mean_squared_error': 0.9272,
 'test_root_mean_squared_log_error': 0.4628,
 'test_r2': 0.132}