## Setup

In [92]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from datetime import timedelta
import random

In [3]:
datos_unidos = pd.read_csv("../data/procesados/datos_unidos.csv")

In [3]:
datos_unidos.columns

Index(['SKU', 'DATE', 'STORE_ID', 'PRICE', 'QUANTITY', 'TOTAL_SALES', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE', 'OPENDATE', 'CLOSEDATE',
       'STORE_SUBGROUP_DATE_ID', 'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'INITIAL_TICKET_PRICE', 'BASE_PRICE',
       'COSTOS', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE',
       'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK'],
      dtype='object')

In [4]:
cols_categoricas = ['SKU', 'STORE_ID', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE',  'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', "DAY_OF_WEEK"]

target = "TOTAL_SALES"

In [5]:
for col in cols_categoricas:
    datos_unidos[col] = datos_unidos[col].astype("category")

## Feature aggregation

Primero, haremos el promedio de ventas por SKU X STORE_ID de los últimos 7, 30 y 90 días, para compensar el hecho de que no es posible (por limitaciones computacionales) completar todo el dataset con los días en que no hubo transacciones de un producto.

De esta manera, el modelo podrá dilucidar las épocas en donde no hay ventas de ciertos productos.

In [8]:
datos_unidos = datos_unidos.sort_values("DATE").reset_index(drop=True)
# Guardamos el índice donde cambia la fecha para acceso rápido
cambios_dia = datos_unidos["DATE"].ne(datos_unidos["DATE"].shift()).to_numpy().nonzero()[0]
fechas_unicas = datos_unidos["DATE"].unique()

# Lista única de combinaciones SKU-STORE_ID
combinaciones = datos_unidos[["SKU", "STORE_ID"]].drop_duplicates()


In [9]:
def rellenar_faltantes(df, fecha):
    # Todas las combinaciones para esta fecha
    comb_fecha = combinaciones.copy()
    comb_fecha["DATE"] = fecha
    # Merge para meter TOTAL_SALES=0 donde falta
    df_completo = comb_fecha.merge(df, on=["SKU", "STORE_ID", "DATE"], how="left")
    df_completo["TOTAL_SALES"] = df_completo["TOTAL_SALES"].fillna(0)
    return df_completo

In [10]:
buffer = pd.DataFrame()
resultados = []
windows = [7, 30, 90]

for window in windows:
    datos_unidos[f"SKU_STORE_mean_{window}D"] = pd.NA

    for fecha in fechas_unicas:
        # Datos del día actual
        df_dia = datos_unidos.loc[datos_unidos["DATE"] == fecha, ["SKU", "STORE_ID", "DATE", "TOTAL_SALES"]]
        df_dia_completo = rellenar_faltantes(df_dia, fecha)

        # Agregar al buffer
        buffer = pd.concat([buffer, df_dia_completo], ignore_index=True)

        # Mantener sólo los últimos window+1 días (para limitar memoria)
        if buffer["DATE"].nunique() > window+1:
            fecha_mas_vieja = buffer["DATE"].min()
            buffer = buffer[buffer["DATE"] != fecha_mas_vieja]

        # Filas originales del día actual
        df_original_dia = datos_unidos.loc[datos_unidos["DATE"] == fecha,
                                        ["SKU", "STORE_ID", "DATE", "TOTAL_SALES"]]

        # Calcular promedio con los días previos que haya 
        dias_previos = sorted(buffer["DATE"].unique())[:-1]  # todos menos el actual
        
        if len(dias_previos) > 0:
            # Tomar como máximo window días previos
            dias_a_usar = dias_previos[-window:]
            df_prev = buffer[buffer["DATE"].isin(dias_a_usar)]
            media_prev = df_prev.groupby(["SKU", "STORE_ID"], observed=False)["TOTAL_SALES"].mean().reset_index()
            media_prev["DATE"] = fecha
            media_prev.rename(columns={"TOTAL_SALES": f"SKU_STORE_mean_{window}D"}, inplace=True)

            # Actualizar directamente en el dataset original
            idx_update = datos_unidos.index[datos_unidos["DATE"] == fecha]
            merged = datos_unidos.loc[idx_update, ["SKU", "STORE_ID", "DATE"]].merge(
                media_prev, on=["SKU", "STORE_ID", "DATE"], how="left")

            # Si no se creó la columna en el merge, la creamos con NaN
            if f"SKU_STORE_mean_{window}D" not in merged.columns:
                merged[f"SKU_STORE_mean_{window}D"] = pd.NA

            datos_unidos.loc[idx_update, f"SKU_STORE_mean_{window}D"] = merged[f"SKU_STORE_mean_{window}D"].values

    datos_unidos.fillna({f"SKU_STORE_mean_{window}D":0}, inplace=True)

  datos_unidos.fillna({f"SKU_STORE_mean_{window}D":0}, inplace=True)
  datos_unidos.fillna({f"SKU_STORE_mean_{window}D":0}, inplace=True)
  datos_unidos.fillna({f"SKU_STORE_mean_{window}D":0}, inplace=True)


Ahora, haremos el promedio y desviación estándar de las ventas por subgrupo y por categoría, de manera que el modelo pueda entender mejor los cambios de ventas por épocas del año de grupos más grandes de productos.

In [11]:
# Agrupar por SUBGROUP y DATE para obtener total diario
def add_rolling_mean_std(datos_unidos, group, windows=[7,30,90]):
    for window in windows:
        nueva_col_mean = f'{group}_mean_{window}D'
        nueva_col_std = f'{group}_std_{window}D'

        subgroup_daily = (
            datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
            .sum()
            .sort_values([group, 'DATE'])
        )

        # Calcular promedio móvil excluyendo el día actual
        # Shift para no incluir el valor del día actual
        subgroup_daily[nueva_col_mean] = (
            subgroup_daily.groupby(group)['TOTAL_SALES']
            .apply(lambda x: x.shift().rolling(window, min_periods=1).mean().fillna(0)) # para el primer dia el valor es 0
            .reset_index(drop=True)
        )

        subgroup_daily[nueva_col_std] = (
        subgroup_daily.groupby(group)['TOTAL_SALES']
        .apply(lambda x: x.shift().rolling(window=7, min_periods=1).std().fillna(0)) # para el primer dia el valor es 0
        .reset_index(drop=True)
        )

        # Asignar columnas al dataframe original
        merge_cols = [group, 'DATE', nueva_col_mean, nueva_col_std]
        merged = datos_unidos.merge(subgroup_daily[merge_cols], on=[group, 'DATE'], how='left')

        datos_unidos[nueva_col_mean] = merged[nueva_col_mean]
        datos_unidos[nueva_col_std] = merged[nueva_col_std]


In [12]:
add_rolling_mean_std(datos_unidos, group="CATEGORY", windows=[30, 90, 180])
add_rolling_mean_std(datos_unidos, group="SUBGROUP", windows=[30, 90, 180])

  datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  datos_unidos.groupby([group, 'DATE'], as_index=False)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']
  subgroup_daily.groupby(group)['TOTAL_SALES']


In [13]:
datos_unidos.to_csv("../data/procesados/data_train.csv", index=False)

## Test de modelos

### Walk-forward

In [15]:
def walk_forward_forecast(df, model, features, target, train_days=365, step_days=30, forecast_days=7):
    """
    Realiza un walk-forward, entrenando el modelo con una expanding window y prediciendo los 
    proximos dias

    df: DataFrame 
    model: modelo sklearn
    train_days: tamaño inicial del set de entrenamiento en días
    step_days: cuántos días se suman en cada iteración
    forecast_days: horizonte de predicción en días
    """
    # Aseguramos orden por fecha
    df["DATE"] = pd.to_datetime(df["DATE"])
    df = df.sort_values("DATE")
    
    results = []
    min_date = df["DATE"].min()
    max_date = df["DATE"].max()

    start_train_end = min_date + timedelta(days=train_days)

    while start_train_end + timedelta(days=forecast_days) <= max_date:
        # Definir ventanas
        train_data = df[df["DATE"] < start_train_end]
        test_data = df[(df["DATE"] >= start_train_end) &
                       (df["DATE"] < start_train_end + timedelta(days=forecast_days))]

        if len(test_data) == 0:
            break

        # Features y target
        X_train = train_data[features]
        y_train = train_data[target]
        X_test = test_data[features]
        y_test = test_data[target]

        # Entrenar
        model.fit(X_train, y_train)

        # Predicciones y métricas
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)

        results.append({
            "train_end_date": start_train_end,
            "r2_train": r2_train,
            "r2_test": r2_test
        })

        # Avanzar ventana
        start_train_end += timedelta(days=step_days)

    return pd.DataFrame(results)

### LightGBM

In [None]:
import lightgbm as lgb

In [10]:
def walk_forward_lightgbm(df, features, target_col, date_col, categorical_cols,
                          train_days=365, step_days=30, forecast_days=7,
                          params=None):
    """
    df: DataFrame con features + target
    target_col: nombre de la columna objetivo (ej. 'TOTAL_SALES')
    date_col: columna con la fecha
    categorical_cols: lista de columnas categóricas (deben ser dtype 'category')
    train_days, step_days, forecast_days: enteros en días
    params: dict de parámetros LightGBM
    """

    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col)

    results = []
    min_date = df[date_col].min()
    max_date = df[date_col].max()
    start_train_end = min_date + timedelta(days=train_days)

    count = 0

    while start_train_end + timedelta(days=forecast_days) <= max_date:
        count+=1

        print(f"Walk-forward: iteracion numero {count}")

        # Train y Test
        train_data = df[df[date_col] < start_train_end]
        test_data = df[(df[date_col] >= start_train_end) &
                       (df[date_col] < start_train_end + timedelta(days=forecast_days))]

        if len(test_data) == 0:
            break

        # Creamos un validation set para early stopping
        valid_days_inner = 7
        train_end_inner = train_data["DATE"].max() - timedelta(days=valid_days_inner)

        train_inner = train_data[train_data["DATE"] <= train_end_inner]
        valid_inner = train_data[train_data["DATE"] > train_end_inner]

        X_train_inner = train_inner[features]
        y_train_inner = train_inner[target_col]
        X_valid_inner = valid_inner[features]
        y_valid_inner = valid_inner[target_col]

        # Dataset LightGBM
        lgb_train = lgb.Dataset(X_train_inner, label=y_train_inner, categorical_feature=categorical_cols)
        lgb_valid = lgb.Dataset(X_valid_inner, label=y_valid_inner, categorical_feature=categorical_cols, reference=lgb_train)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train_inner", "valid_inner"]
        )

        # Predicciones
        y_test_pred = model.predict(test_data[features], num_iteration=model.best_iteration)
        y_train_pred = model.predict(X_train_inner,num_iteration=model.best_iteration)
        
        # Métricas
        r2_test = r2_score(test_data[target], y_test_pred)
        r2_train = r2_score(y_train_inner, y_train_pred)


        results.append({
            "train_end_date": start_train_end,
            "r2_train": r2_train,
            "r2_test": r2_test
        })

        start_train_end += timedelta(days=step_days)

    return pd.DataFrame(results)


In [None]:
cols = list(datos_unidos.columns)
features = [col for col in cols if col not in ["DATE", "TOTAL_SALES",
                                               'INITIAL_TICKET_PRICE', 'BASE_PRICE', "COSTOS", 
                                               "OPENDATE", "CLOSEDATE", "QUANTITY", "STORE_SUBGROUP_DATE_ID"] ]

In [13]:
results_lgb = walk_forward_lightgbm(
    df=datos_unidos,
    features=features,
    target_col="TOTAL_SALES",
    date_col="DATE",
    categorical_cols=cols_categoricas,
    train_days=365,
    step_days=30,
    forecast_days=7,
    params={
        "objective": "regression",
        "metric": "rmse",
        "verbosity": 2,
        "learning_rate": 0.01,
        "num_leaves": 500,
        "max_depth": 20,
        "min_data_in_leaf": 50,
        "feature_fraction": 1,
        "bagging_fraction": 1,
        "bagging_freq": 0,
        "early_stopping_round": 20,
        "num_boost_round":1000
    }
)

Walk-forward: iteracion numero 1
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.951242
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.221594
[LightGBM] [Debug] init for col-wise cost 0.068858 seconds, init for row-wise cost 0.241743 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.181673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1641
[LightGBM] [Info] Number of data points in the train set: 2242841, number of used features: 21
[LightGBM] [Info] Start training from score 299.483629
[LightGBM] [Debug] Trained a tree with leaves = 500 and depth = 15
Training until validation scores don't improve for 20 rounds
[LightGBM] [Debug] Trained a tree with leaves = 500 and depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 500 and dept

In [14]:
results_lgb.to_csv("resultados_test/resultados_lgb5.csv")

In [15]:
results_lgb.mean()

train_end_date    2022-12-27 00:00:00
r2_train                     0.757261
r2_test                      0.676761
dtype: object

## Deploy

### Training

In [6]:
import lightgbm as lgb

In [81]:
features = ['SKU', 'STORE_ID', 'PRICE', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE',  'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE',
       'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK']

In [8]:
params={
        "objective": "regression",
        "metric": "rmse",
        "verbosity": 1,
        "learning_rate": 0.01,
        "num_leaves": 200,
        "max_depth": 20,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
		"num_boost_round" : 100,
    }


In [9]:
data_train = lgb.Dataset(datos_unidos[features], datos_unidos["TOTAL_SALES"], categorical_feature=cols_categoricas)

In [10]:
model = lgb.train(params, data_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.060821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1649
[LightGBM] [Info] Number of data points in the train set: 6839616, number of used features: 22
[LightGBM] [Info] Start training from score 266.144489


### Grid precios

In [11]:
columnas_extraidas = ['SKU', 'STORE_ID', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE', 'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE']

In [46]:
# Creamos un dataframe con todas las combinaciones de SKU X STORE_ID
template = datos_unidos[columnas_extraidas].drop_duplicates().reset_index(drop=True)

In [47]:
# Agregamos los ultimos costos de los productos
ultimos_costos = (
    datos_unidos
    .groupby(["SKU", "STORE_ID"], as_index=False)
    .last()[["SKU", "STORE_ID", "COSTOS"]]
)

template = template.merge(ultimos_costos, on=["SKU", "STORE_ID"], how="left")

  .groupby(["SKU", "STORE_ID"], as_index=False)


In [None]:
# Quitamos las tiendas que ya cerraron
# template = template[template["YEAR_CLOSE"] > 2023]

In [None]:
# Hay 152 (numero de tiendas) . 854 (numero de sku) combinaciones
len(template)

129808

In [49]:
# Cada uno de los 7 dias tendra todas las combinaciones
fechas = pd.date_range(start="2024-01-01", periods=7, freq="D")
df_fechas = pd.DataFrame({"DATE": fechas})

template = (
    df_fechas.assign(key=1)
    .merge(template.assign(key=1), on="key")
    .drop(columns="key")
)

In [50]:
# Features agregados
template["DATE"] = pd.to_datetime(template["DATE"])
template["YEAR"] = template["DATE"].dt.year
template["MONTH"] = template["DATE"].dt.month
template["DAY"] = template["DATE"].dt.day
template["DAY_OF_WEEK"] = template["DATE"].dt.day_name()
template["WEEK"] = template["DATE"].dt.isocalendar().week

In [51]:
for col in cols_categoricas:
    template[col] = template[col].astype("category")

In [52]:
template.columns

Index(['DATE', 'SKU', 'STORE_ID', 'REGION', 'CITY', 'STATE', 'STORE_TYPE',
       'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE', 'PRICE_GROUP_ID',
       'BRAND', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE',
       'COSTOS', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK'],
      dtype='object')

In [91]:
def crear_price_grid(datos_unidos: pd.DataFrame, n_prices: int = 50):
    """
    Devuelve dict { SKU: array_de_50_precios } usando min/max histórico por SKU.
    """
    price_ranges = datos_unidos.groupby('SKU')['PRICE'].agg(['min', 'max']).reset_index()
    price_grid = {}
    for _, r in price_ranges.iterrows():
        sku = r['SKU']
        min_p, max_p = r['min'], r['max']
        if pd.isna(min_p) or pd.isna(max_p):
            continue
        # Si min == max, np.linspace devuelve 1 valor repetido; está OK.
        price_grid[sku] = np.linspace(min_p, max_p, n_prices)
    return price_grid

In [78]:
price_grid = crear_price_grid(datos_unidos, n_prices=20)

  price_ranges = datos_unidos.groupby('SKU')['PRICE'].agg(['min', 'max']).reset_index()


In [None]:
def optimizacion_precios(template, model, price_grid, features, cant_precios, n_iter=1000,
                         target="GAIN", predict_func=None, save_dir=None):

    """
    Toma un Dataframe base (template) con todas las combinaciones de SKU X STORE_ID de los proximos dias y un modelo
    con su funcion predict, construye una posible configuracion de precios segun price_grid (un precio por cada SKU X STORE_ID),
    y busca la configuracion que maximiza target

    template: pandas Dataframe (con features y "COSTOS")
    model: modelo de prediccion 
    price_grid: diccionario con SKU como claves y rango de precios como valores (de la funcion crear_price_grid())
    features: lista de features que utiliza el modelo
    cant_precios: cantidad de precios que hay en cada rango
    n_iter: int
    target: "GAIN" si se maximiza la ganancia neta o "TOTAL_SALES" si se maximiza las ventas totales
    predict_func: en caso de que el modelo necesite preprocesamiento, esta la opcion de pasar una funcion especial (sino se utiliza model.predict())
    save_dir: directory para guardar los resultados
    """

    # Precalculos
    n_rows = template.shape[0]
    costos = template["COSTOS"].values
    sku_array = template["SKU"].values

    # Mapeamos precios posibles para cada SKU en forma de matriz
    precios_dict = {sku: np.array(price_grid[sku]) for sku in price_grid}

    precios_matrix = np.zeros((n_rows, cant_precios), dtype=float)
    sku_indices = np.zeros(n_rows, dtype=int)

    for i, sku in enumerate(sku_array):
        precios_sku = precios_dict[sku]
        precios_matrix[i, :len(precios_sku)] = precios_sku
        sku_indices[i] = len(precios_sku)

    # DataFrame base sin PRICE
    X_base = template.copy()

    mejor_gain = -np.inf
    mejor_sales = -np.inf
    mejor_config = None

    for n in range(n_iter):
        print(f"Iteracion: {n}")
        # Elección vectorizada de precios aleatorios
        idx_random = np.array([random.randrange(sku_indices[i]) for i in range(n_rows)])
        precios_asignados = precios_matrix[np.arange(n_rows), idx_random]

        # Predicción
        if predict_func is None:
            y_pred = model.predict(X_base.assign(PRICE=precios_asignados)[features])
        else:
            y_pred = predict_func(model, X_base.assign(PRICE=precios_asignados)[features])

        # Cálculo de ganancia
        gain = y_pred.sum() - ((y_pred / precios_asignados) * costos).sum()
        total_sales = y_pred.sum()

        # Guardar mejor configuración
        if target == "GAIN":
            if gain > mejor_gain:
                mejor_gain = gain
                mejor_sales = total_sales
                mejor_config = precios_asignados.copy()

        if target == "TOTAL_SALES":
            if total_sales > mejor_sales:
                mejor_gain = gain
                mejor_sales = total_sales
                mejor_config = precios_asignados.copy()

    # Guardar mejor configuración en memoria
    if save_dir:
        df_mejor = template.copy()
        df_mejor["PRICE"] = mejor_config
        df_mejor.to_csv(f"{save_dir}/mejor_config.csv", index=False)

    return mejor_sales, mejor_gain, mejor_config


In [None]:
max, mejor_config = optimizacion_precios(template=template, model=model, price_grid=price_grid,cant_precios=20, features=features, n_iter=1000, target="GAIN")