In [90]:
import sys
import warnings
import re

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_squared_error,
    d2_tweedie_score,
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder

from category_encoders import LeaveOneOutEncoder, TargetEncoder

DATA_PATH = "../data"

In [91]:
# Load excel file
df = pd.read_excel(DATA_PATH + "/consumo_material_clean.xlsx")

## Preprocessing

In [92]:
# Separate code into two columns
new_columns = df["CODIGO"].str.extract(r"([a-zA-Z]+)([0-9]+)", expand=False)
df["CODIGO_CLASS"] = new_columns[0]
df["CODIGO_NUM"] = new_columns[1]
df.drop(columns=["CODIGO"], inplace=True)

In [93]:
# FECHAPEDIDO to datetime in day/month/year format
df["FECHAPEDIDO"] = pd.to_datetime(df["FECHAPEDIDO"], dayfirst=True)
df.sort_values(by=["FECHAPEDIDO"], inplace=True)
df.reset_index(drop=True, inplace=True)

  df["FECHAPEDIDO"] = pd.to_datetime(df["FECHAPEDIDO"], dayfirst=True)


In [94]:
# separate ORIGEN in three columns by '-'
origin_separated_columns = df["ORIGEN"].str.split("-", expand=True)
df["PURCHASING_HOSPITAL"] = origin_separated_columns[1]
df["PURCHASING_DEPARTMENT"] = origin_separated_columns[2]
df.drop(columns=["ORIGEN"], inplace=True)

In [95]:
# drop duplicates
df.drop_duplicates(inplace=True)

In [96]:
df.columns

Index(['FECHAPEDIDO', 'NUMERO', 'REFERENCIA', 'CANTIDADCOMPRA',
       'UNIDADESCONSUMOCONTENIDAS', 'PRECIO', 'IMPORTELINEA', 'TIPOCOMPRA',
       'TGL', 'PRODUCTO', 'CODIGO_CLASS', 'CODIGO_NUM', 'PURCHASING_HOSPITAL',
       'PURCHASING_DEPARTMENT'],
      dtype='object')

In [97]:
# basic date features
def generate_date_features(df):
    df["YEAR"] = df["FECHAPEDIDO"].dt.year
    df["MONTH"] = np.sin(2 * np.pi * df["FECHAPEDIDO"].dt.month / 12)
    df["DAYOFMONTH"] = np.sin(2 * np.pi * df["FECHAPEDIDO"].dt.day / 31)
    df["DAYOFYEAR"] = np.sin(2 * np.pi * df["FECHAPEDIDO"].dt.dayofyear / 365)
    return df

In [98]:
def add_timeseries_features(df):
    # MEANS
    df["ROLLING_MEAN_3M"] = df["CANTIDADCOMPRA"].rolling(90).mean()
    df["WEIGHTED_MEAN_3M"] = (
        df["CANTIDADCOMPRA"]
        .rolling(90)
        .apply(lambda x: np.average(x, weights=range(1, len(x) + 1)))
    )
    df["ROLLING_MEAN_1Y"] = df["CANTIDADCOMPRA"].rolling(365).mean()
    df["WEIGHTED_MEAN_1Y"] = (
        df["CANTIDADCOMPRA"]
        .rolling(365)
        .apply(lambda x: np.average(x, weights=range(1, len(x) + 1)))
    )
    df["EWMA_3M"] = df["CANTIDADCOMPRA"].ewm(span=90).mean()
    df["EWMA_1Y"] = df["CANTIDADCOMPRA"].ewm(span=365).mean()

    # LAGS
    df["SHIFT_1W"] = df["CANTIDADCOMPRA"].shift(7)
    df["SHIFT_2W"] = df["CANTIDADCOMPRA"].shift(14)
    df["SHIFT_1M"] = df["CANTIDADCOMPRA"].shift(30)
    df["SHIFT_3M"] = df["CANTIDADCOMPRA"].shift(90)
    df["SHIFT_1Y"] = df["CANTIDADCOMPRA"].shift(365)

    return df

In [99]:
def generate_train_test_df(full_df):
    # Get train and test sets
    train = full_df[full_df["YEAR"] < 2023]
    X_train = train.drop(columns=["CANTIDADCOMPRA", "FECHAPEDIDO"])
    y_train = train["CANTIDADCOMPRA"]

    test = full_df[full_df["YEAR"] == 2023]
    X_test = test.drop(columns=["CANTIDADCOMPRA", "FECHAPEDIDO"])
    y_test = test["CANTIDADCOMPRA"]

    return train, X_train, y_train, test, X_test, y_test

In [100]:
def smape_score(A, F):
    return 100 / len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [101]:
def train_model_eval(X_train, y_train, X_test, y_test):
    model = XGBRegressor(random_state=42, n_estimators=1000)
    model.fit(X_train, y_train)

    y_test_pred = model.predict(X_test)
    y_test_pred = np.maximum(y_test_pred, 0)  # clip negative predictions to 0

    test_loss = mean_squared_error(y_test, y_test_pred, squared=False)
    mape_error = mean_absolute_percentage_error(y_test, y_test_pred)
    tweedie = d2_tweedie_score(y_test, y_test_pred)
    smape_err = smape_score(y_test, y_test_pred)

    return test_loss, mape_error, smape_err, tweedie

In [103]:
# iterate over products
columns = [
    "FECHAPEDIDO",
    "CANTIDADCOMPRA",
    "PURCHASING_HOSPITAL",
    "PURCHASING_DEPARTMENT",
]

product_losses = pd.DataFrame(columns=["PRODUCT", "Tweedie", "MSE", "SMAPE"])
for product in df["CODIGO_NUM"].unique():
    partial_df = df[df["CODIGO_NUM"] == product]
    partial_df = partial_df.groupby(columns).sum().reset_index()

    loo = LeaveOneOutEncoder()
    partial_df["PURCHASING_HOSPITAL"] = loo.fit_transform(
        partial_df["PURCHASING_HOSPITAL"], partial_df["CANTIDADCOMPRA"]
    )

    loo = TargetEncoder()
    partial_df["PURCHASING_DEPARTMENT"] = loo.fit_transform(
        partial_df["PURCHASING_DEPARTMENT"], partial_df["CANTIDADCOMPRA"]
    )

    partial_df = partial_df[columns]
    partial_df = generate_date_features(partial_df)
    partial_df = add_timeseries_features(partial_df)

    is_2023_in_df = 2023 in partial_df["YEAR"].unique()
    product_blacklist = ["85758", "73753"]  # stops selling on 2023
    if not is_2023_in_df or product in product_blacklist:
        continue

    train, X_train, y_train, test, X_test, y_test = generate_train_test_df(partial_df)
    test_loss, mape_error, smape, tweedie = train_model_eval(
        X_train, y_train, X_test, y_test
    )

    product_losses = pd.concat(
        [
            product_losses,
            pd.DataFrame(
                [[product, tweedie, test_loss, smape]],
                columns=["PRODUCT", "Tweedie", "MSE", "SMAPE"],
            ),
        ]
    )

In [None]:
product_losses.sort_values(by=["SMAPE"], ascending=False)

Unnamed: 0,PRODUCT,Tweedie,MSE,SMAPE
0,73753,-0.383431,534.810329,64.420002
0,65007,-1.666662,81.00919,58.333215
0,66071,-0.882426,1310.305305,44.794663
0,85769,0.480506,52.962525,44.711621
0,46846,0.085056,3.128972,44.65539
0,64544,-0.100132,316.910496,40.64338
0,64751,-0.693816,1051.849186,36.210155
0,64488,0.63579,162.203508,36.050012
0,64764,-0.143917,1259.575953,34.822913
0,67835,-1.065194,95.805219,32.911103


In [None]:
product_losses["SMAPE"].mean(), product_losses["MSE"].mean(), product_losses[
    product_losses["Tweedie"] != -np.inf
]["Tweedie"].mean()

(23.684781939151666, 215.0087995100757, -0.11407273352736806)

28.576833408153984