In [1]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../src')

import pandas as pd
pd.set_option('display.max_rows', 100)

import cudf
from pathlib import Path

from sales_project.evaluations import evaluate
from sales_project.utils import save_predictions
from sales_project.stacking_ensemble import StackingEnsemble



In [2]:
# Column names shared by the cells below: the label and the time index.
target = 'item_cnt_month'
timestamp_col = 'date_block_num'

# Model inputs are every column of the submission frame except the label,
# the time index, the shop/item id columns and 'shop_open_days'.
features = list(pd.read_parquet("../data/artifacts/df_submission.parquet").columns)
for excluded_col in (target, timestamp_col, 'shop_id', 'item_id', 'shop_open_days'):
    # list.remove raises ValueError when the column is absent, so a schema
    # change in the parquet file fails loudly here rather than downstream.
    features.remove(excluded_col)

# Stacking model 1 (XGBRegressor meta-model) -> score: 0.96608

In [3]:
# Build the stacking ensemble from its YAML parameter file.
ensemble = StackingEnsemble(Path('../params/stacking_model.yaml'))

# Fit arguments: first-level models use months 1-16, the second level
# (meta model) uses months 17-32.
# NOTE(review): is_cudf=False presumably selects the pandas (CPU) code path --
# confirm against StackingEnsemble.fit.
fit_kwargs = dict(
    data_file_path=Path("../data/artifacts/df3.parquet"),
    feats=features,
    target=target,
    timestamp_col=timestamp_col,
    first_level_timestamps=[*range(1, 17)],
    second_level_timestamps=[*range(17, 33)],
    is_cudf=False,
)

# Kept as the last bare expression so the cell still displays fit's return value.
ensemble.fit(**fit_kwargs)

First level models have already been fitted. No need to read first level data.
Reading second level data...
Predicting using first level models...


  0%|          | 0/2 [00:00<?, ?it/s]

Fitting meta model XGBRegressor...
pkl file saved at: ../models/meta_XGBRegressor_2024-08-15_23:22:10.pkl
yaml file saved at: ../params/stacking_model.yaml


<sales_project.stacking_ensemble.StackingEnsemble at 0x7f021a2e47d0>

In [4]:
# Hold-out evaluation on month 33, which lies outside both training windows
# passed to ensemble.fit (first level 1-16, second level 17-32).
#
# The target is clipped to [0, 20] inside the method chain with .assign rather
# than by assigning a column onto the filtered frame afterwards: writing to the
# result of .query() is the chained-assignment pattern behind pandas'
# SettingWithCopyWarning, which the blanket warnings filter at the top of the
# notebook would otherwise silence.
df_test = (
    pd.read_parquet("../data/artifacts/df3.parquet")
    .query(f"{timestamp_col} == 33")
    .assign(**{target: lambda frame: frame[target].clip(0, 20)})
)

# Adds the first-level and meta-model prediction columns used below.
df_test = ensemble.predict(df=df_test, feats=features)

# Compare each first-level model against the stacked meta model on the same rows.
for pred_col in ['LGBMRegressor_pred', 'XGBRegressor_pred', 'XGBRegressor_metapred']:
    print(pred_col, evaluate(df_test, target, pred_col))

  0%|          | 0/2 [00:00<?, ?it/s]

LGBMRegressor_pred {'MAE': 0.29317721518047557, 'MSE': 0.6210662809199939, 'RMSE': 0.7880775855967443, 'R2': 0.37265683834537966, 'MAPE': 608010645592480.0, 'SMAPE': nan}
XGBRegressor_pred {'MAE': 0.3232717216014862, 'MSE': 0.6980032920837402, 'RMSE': 0.8354659131788323, 'R2': 0.2949422597885132, 'MAPE': 733291778932736.0, 'SMAPE': nan}
XGBRegressor_metapred {'MAE': 0.30103766918182373, 'MSE': 0.591895580291748, 'RMSE': 0.7693475029476264, 'R2': 0.4021223187446594, 'MAPE': 666325923921920.0, 'SMAPE': nan}


In [15]:
# Score the submission frame with the fitted ensemble and export the
# meta-model prediction as the target column.
# NOTE(review): no clipping is applied to the predictions in this cell, while
# the hold-out target is clipped to [0, 20] before evaluation -- confirm
# whether save_predictions clips internally.
df_submission = ensemble.predict(
    df=pd.read_parquet("../data/artifacts/df_submission.parquet"),
    feats=features,
)
df_submission[target] = df_submission['XGBRegressor_metapred']

submission_columns = ['shop_id', 'item_id', target]
save_predictions(df_submission[submission_columns], 'stacking_test2.csv')

  0%|          | 0/2 [00:00<?, ?it/s]

csv file saved at: ../data/predictions/stacking_test2.csv


# Stacking model 2 (LGBMRegressor meta-model) -> score: 0.96435

In [3]:
# Build the second stacking ensemble from its own YAML parameter file.
ensemble = StackingEnsemble(Path('../params/stacking_model2.yaml'))

# Fit arguments: first-level models use months 1-16, the second level
# (meta model) uses months 17-32.
# NOTE(review): is_cudf=False presumably selects the pandas (CPU) code path --
# confirm against StackingEnsemble.fit.
fit_kwargs = dict(
    data_file_path=Path("../data/artifacts/df3.parquet"),
    feats=features,
    target=target,
    timestamp_col=timestamp_col,
    first_level_timestamps=[*range(1, 17)],
    second_level_timestamps=[*range(17, 33)],
    is_cudf=False,
)

# Kept as the last bare expression so the cell still displays fit's return value.
ensemble.fit(**fit_kwargs)

First level models have already been fitted. No need to read first level data.
Reading second level data...
Predicting using first level models...


  0%|          | 0/2 [00:00<?, ?it/s]

Fitting meta model LGBMRegressor...
pkl file saved at: ../models/meta_LGBMRegressor_2024-08-15_23:24:38.pkl
yaml file saved at: ../params/stacking_model2.yaml


<sales_project.stacking_ensemble.StackingEnsemble at 0x7fb2a3168410>

In [6]:
# Hold-out evaluation on month 33, which lies outside both training windows
# passed to ensemble.fit (first level 1-16, second level 17-32).
#
# The target is clipped to [0, 20] inside the method chain with .assign rather
# than by assigning a column onto the filtered frame afterwards: writing to the
# result of .query() is the chained-assignment pattern behind pandas'
# SettingWithCopyWarning, which the blanket warnings filter at the top of the
# notebook would otherwise silence.
df_test = (
    pd.read_parquet("../data/artifacts/df3.parquet")
    .query(f"{timestamp_col} == 33")
    .assign(**{target: lambda frame: frame[target].clip(0, 20)})
)

# Adds the first-level and meta-model prediction columns used below.
df_test = ensemble.predict(df=df_test, feats=features)

# Compare each first-level model against the stacked meta model on the same rows.
for pred_col in ['LGBMRegressor_pred', 'XGBRegressor_pred', 'LGBMRegressor_metapred']:
    print(pred_col, evaluate(df_test, target, pred_col))

LGBMRegressor_pred {'MAE': 0.29317721518047557, 'MSE': 0.6210662809199939, 'RMSE': 0.7880775855967443, 'R2': 0.37265683834537966, 'MAPE': 608010645592480.0, 'SMAPE': nan}
XGBRegressor_pred {'MAE': 0.3232717216014862, 'MSE': 0.6980032920837402, 'RMSE': 0.8354659131788323, 'R2': 0.2949422597885132, 'MAPE': 733291778932736.0, 'SMAPE': nan}
LGBMRegressor_metapred {'MAE': 0.30686542306348585, 'MSE': 0.6063440972183074, 'RMSE': 0.7786809983673079, 'R2': 0.3875278135594824, 'MAPE': 698288598966636.6, 'SMAPE': nan}


In [7]:
# Score the submission frame with the fitted ensemble and export the
# meta-model prediction as the target column.
# NOTE(review): no clipping is applied to the predictions in this cell, while
# the hold-out target is clipped to [0, 20] before evaluation -- confirm
# whether save_predictions clips internally.
df_submission = ensemble.predict(
    df=pd.read_parquet("../data/artifacts/df_submission.parquet"),
    feats=features,
)
df_submission[target] = df_submission['LGBMRegressor_metapred']

submission_columns = ['shop_id', 'item_id', target]
save_predictions(df_submission[submission_columns], 'stacking_test3.csv')

  0%|          | 0/2 [00:00<?, ?it/s]

csv file saved at: ../data/predictions/stacking_test3.csv
