## Setup

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score
from datetime import timedelta

In [2]:
datos_unidos = pd.read_csv("../data/procesados/datos_unidos.csv")

In [3]:
datos_unidos.columns

Index(['SKU', 'DATE', 'STORE_ID', 'PRICE', 'QUANTITY', 'TOTAL_SALES', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE', 'OPENDATE', 'CLOSEDATE',
       'STORE_SUBGROUP_DATE_ID', 'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'INITIAL_TICKET_PRICE', 'BASE_PRICE',
       'COSTOS', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK', 'YEAR_OPEN',
       'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE'],
      dtype='object')

In [4]:
cols_categoricas = ['SKU', 'STORE_ID', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE',  'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND',
       'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK',
       'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE']

cols_numericas = ['PRICE', 'SKU_mean_7D',
       'SKU_std_7D', 'SKU_mean_30D', 'SKU_std_30D', 'SKU_mean_90D',
       'SKU_std_90D', 'SKU_mean_180D', 'SKU_std_180D', 'STORE_ID_mean_7D',
       'STORE_ID_std_7D', 'STORE_ID_mean_30D', 'STORE_ID_std_30D',
       'STORE_ID_mean_90D', 'STORE_ID_std_90D', 'STORE_ID_mean_180D',
       'STORE_ID_std_180D']

features = cols_categoricas + cols_numericas 
target = "TOTAL_SALES"

In [5]:
datos_unidos.drop(columns=["OPENDATE", "CLOSEDATE", "QUANTITY", "STORE_SUBGROUP_DATE_ID"], inplace=True)

In [6]:
datos_unidos.drop(columns=['INITIAL_TICKET_PRICE', 'BASE_PRICE', "COSTOS"], inplace=True)

In [7]:
for col in cols_categoricas:
    datos_unidos[col] = datos_unidos[col].astype("category")

## Feature aggregation

Agregamos una rolling window de las ventas totales de cada producto y de cada tienda

In [None]:
def rolling_features(df, group_col, windows = [7,30, 90, 180]):
    df.sort_values(by=['DATE', "SKU", "STORE_ID"])
    
    for window in windows:
        df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
            .rolling(window=window, min_periods=1).mean().reset_index(level=[0,1], drop=True)
        
        df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
            .rolling(window=window, min_periods=1).std().reset_index(level=[0,1], drop=True)

    return df

In [9]:
rolling_features(datos_unidos, group_col="SKU")

  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \


Unnamed: 0,SKU,DATE,STORE_ID,PRICE,TOTAL_SALES,REGION,CITY,STATE,STORE_TYPE,CATEGORY,...,MONTH_OPEN,MONTH_CLOSE,SKU_mean_7D,SKU_std_7D,SKU_mean_30D,SKU_std_30D,SKU_mean_90D,SKU_std_90D,SKU_mean_180D,SKU_std_180D
0,BEAHASH001,2021-01-01,S00068,35.53,355.30,West,Lakewood,CO,Express,Beauty,...,11,8,,,,,,,,
1,BEAHASH001,2021-01-01,S00086,33.52,67.04,Southeast,Raleigh,NC,Express,Beauty,...,12,8,211.170000,203.830601,211.170000,203.830601,211.170000,203.830601,211.170000,203.830601
2,BEAHASH001,2021-01-01,S00124,37.61,75.22,Northeast,Erie,PA,Outlet,Beauty,...,5,8,165.853333,164.116598,165.853333,164.116598,165.853333,164.116598,165.853333,164.116598
3,BEAHASH001,2021-01-01,S00140,34.51,138.04,Southeast,Greenville,SC,Mall,Beauty,...,5,8,158.900000,134.720330,158.900000,134.720330,158.900000,134.720330,158.900000,134.720330
4,BEAHASH001,2021-01-02,S00013,33.77,33.77,Southwest,El Paso,TX,Mall,Beauty,...,10,8,133.874000,129.397368,133.874000,129.397368,133.874000,129.397368,133.874000,129.397368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832777,TOYGAPU006,2023-12-31,S00104,25.96,77.89,Midwest,Toledo,OH,Street,Toys,...,11,11,69.464286,33.823442,71.270000,29.866924,90.673444,42.103436,103.919000,64.134093
6832778,TOYGAPU006,2023-12-31,S00110,25.01,125.05,Midwest,Detroit,MI,Street,Toys,...,6,8,76.468571,39.932287,71.929000,30.845298,90.897889,42.233498,103.759722,64.045261
6832779,TOYGAPU006,2023-12-31,S00138,24.74,49.48,Southeast,Orlando,FL,Street,Toys,...,1,8,76.637143,39.795815,71.156333,31.115499,90.928111,42.202520,103.246500,64.108350
6832780,TOYGAPU006,2023-12-31,S00154,26.21,52.42,Northeast,Newark,NJ,Outlet,Toys,...,1,8,80.405714,35.188126,71.268333,31.039174,90.355889,42.373182,101.685278,61.853321


In [10]:
rolling_features(datos_unidos, group_col="STORE_ID")

  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_mean_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \
  df[f"{group_col}_std_{window}D"] = df.groupby(group_col)["TOTAL_SALES"] \


Unnamed: 0,SKU,DATE,STORE_ID,PRICE,TOTAL_SALES,REGION,CITY,STATE,STORE_TYPE,CATEGORY,...,SKU_mean_180D,SKU_std_180D,STORE_ID_mean_7D,STORE_ID_std_7D,STORE_ID_mean_30D,STORE_ID_std_30D,STORE_ID_mean_90D,STORE_ID_std_90D,STORE_ID_mean_180D,STORE_ID_std_180D
0,BEAHASH001,2021-01-01,S00068,35.53,355.30,West,Lakewood,CO,Express,Beauty,...,,,,,,,,,,
1,BEAHASH001,2021-01-01,S00086,33.52,67.04,Southeast,Raleigh,NC,Express,Beauty,...,211.170000,203.830601,189.555000,71.679414,189.555000,71.679414,189.555000,71.679414,189.555000,71.679414
2,BEAHASH001,2021-01-01,S00124,37.61,75.22,Northeast,Erie,PA,Outlet,Beauty,...,165.853333,164.116598,149.630000,85.737892,149.630000,85.737892,149.630000,85.737892,149.630000,85.737892
3,BEAHASH001,2021-01-01,S00140,34.51,138.04,Southeast,Greenville,SC,Mall,Beauty,...,158.900000,134.720330,139.822500,72.700795,139.822500,72.700795,139.822500,72.700795,139.822500,72.700795
4,BEAHASH001,2021-01-02,S00013,33.77,33.77,Southwest,El Paso,TX,Mall,Beauty,...,133.874000,129.397368,118.648000,78.777227,118.648000,78.777227,118.648000,78.777227,118.648000,78.777227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832777,TOYGAPU006,2023-12-31,S00104,25.96,77.89,Midwest,Toledo,OH,Street,Toys,...,103.919000,64.134093,90.931429,34.613600,119.272333,60.279373,121.748333,72.415563,126.126222,76.247925
6832778,TOYGAPU006,2023-12-31,S00110,25.01,125.05,Midwest,Detroit,MI,Street,Toys,...,103.759722,64.045261,97.771429,36.122720,120.269333,60.112391,122.425000,72.154704,126.618667,75.950464
6832779,TOYGAPU006,2023-12-31,S00138,24.74,49.48,Southeast,Orlando,FL,Street,Toys,...,103.246500,64.108350,95.900000,35.893869,118.346667,59.993423,122.107111,72.216609,126.552389,75.973767
6832780,TOYGAPU006,2023-12-31,S00154,26.21,52.42,Northeast,Newark,NJ,Outlet,Toys,...,101.685278,61.853321,84.830000,44.341279,114.390667,62.114811,121.308111,72.888943,126.070389,76.340047


In [69]:
datos_unidos.bfill(inplace=True)

## Test de modelos

### Walk-forward

In [15]:
def walk_forward_forecast(df, model, features, target, train_days=365, step_days=30, forecast_days=7):
    """
    Realiza un walk-forward, entrenando el modelo con una expanding window y prediciendo los 
    proximos dias

    df: DataFrame 
    model: modelo sklearn
    train_days: tamaño inicial del set de entrenamiento en días
    step_days: cuántos días se suman en cada iteración
    forecast_days: horizonte de predicción en días
    """
    # Aseguramos orden por fecha
    df["DATE"] = pd.to_datetime(df["DATE"])
    df = df.sort_values("DATE")
    
    results = []
    min_date = df["DATE"].min()
    max_date = df["DATE"].max()

    start_train_end = min_date + timedelta(days=train_days)

    while start_train_end + timedelta(days=forecast_days) <= max_date:
        # Definir ventanas
        train_data = df[df["DATE"] < start_train_end]
        test_data = df[(df["DATE"] >= start_train_end) &
                       (df["DATE"] < start_train_end + timedelta(days=forecast_days))]

        if len(test_data) == 0:
            break

        # Features y target
        X_train = train_data[features]
        y_train = train_data[target]
        X_test = test_data[features]
        y_test = test_data[target]

        # Entrenar
        model.fit(X_train, y_train)

        # Predicciones y métricas
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)

        results.append({
            "train_end_date": start_train_end,
            "r2_train": r2_train,
            "r2_test": r2_test
        })

        # Avanzar ventana
        start_train_end += timedelta(days=step_days)

    return pd.DataFrame(results)

### SVM

In [16]:
from sklearn.svm import SVR

In [19]:
# preprocesamiento
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cols_categoricas),
    ("num", StandardScaler(), cols_numericas)
    ])

data_encoded = preprocessor.fit_transform(datos_unidos)

In [20]:
data_encoded.shape

(6832782, 2181)

In [21]:
del datos_unidos

In [22]:
model = SVR(
    kernel='rbf',
    C=10,
    gamma='scale',  
    epsilon=0.1
)

In [25]:
results_svr = walk_forward_forecast(data_encoded, model=model, features=features, target=target)

IndexError: Index dimension must be 1 or 2

### LightGBM

In [71]:
datos_unidos

Unnamed: 0,SKU,DATE,STORE_ID,PRICE,TOTAL_SALES,REGION,CITY,STATE,STORE_TYPE,CATEGORY,...,SKU_mean_180D,SKU_std_180D,STORE_ID_mean_7D,STORE_ID_std_7D,STORE_ID_mean_30D,STORE_ID_std_30D,STORE_ID_mean_90D,STORE_ID_std_90D,STORE_ID_mean_180D,STORE_ID_std_180D
0,BEAHASH001,2021-01-01,S00068,35.53,355.30,West,Lakewood,CO,Express,Beauty,...,355.300000,203.830601,138.870000,71.679414,138.870000,71.679414,138.870000,71.679414,138.870000,71.679414
1,BEAHASH001,2021-01-01,S00086,33.52,67.04,Southeast,Raleigh,NC,Express,Beauty,...,211.170000,203.830601,189.555000,71.679414,189.555000,71.679414,189.555000,71.679414,189.555000,71.679414
2,BEAHASH001,2021-01-01,S00124,37.61,75.22,Northeast,Erie,PA,Outlet,Beauty,...,165.853333,164.116598,149.630000,85.737892,149.630000,85.737892,149.630000,85.737892,149.630000,85.737892
3,BEAHASH001,2021-01-01,S00140,34.51,138.04,Southeast,Greenville,SC,Mall,Beauty,...,158.900000,134.720330,139.822500,72.700795,139.822500,72.700795,139.822500,72.700795,139.822500,72.700795
4,BEAHASH001,2021-01-02,S00013,33.77,33.77,Southwest,El Paso,TX,Mall,Beauty,...,133.874000,129.397368,118.648000,78.777227,118.648000,78.777227,118.648000,78.777227,118.648000,78.777227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832777,TOYGAPU006,2023-12-31,S00104,25.96,77.89,Midwest,Toledo,OH,Street,Toys,...,103.919000,64.134093,90.931429,34.613600,119.272333,60.279373,121.748333,72.415563,126.126222,76.247925
6832778,TOYGAPU006,2023-12-31,S00110,25.01,125.05,Midwest,Detroit,MI,Street,Toys,...,103.759722,64.045261,97.771429,36.122720,120.269333,60.112391,122.425000,72.154704,126.618667,75.950464
6832779,TOYGAPU006,2023-12-31,S00138,24.74,49.48,Southeast,Orlando,FL,Street,Toys,...,103.246500,64.108350,95.900000,35.893869,118.346667,59.993423,122.107111,72.216609,126.552389,75.973767
6832780,TOYGAPU006,2023-12-31,S00154,26.21,52.42,Northeast,Newark,NJ,Outlet,Toys,...,101.685278,61.853321,84.830000,44.341279,114.390667,62.114811,121.308111,72.888943,126.070389,76.340047


In [15]:
import lightgbm as lgb

In [None]:
def walk_forward_lightgbm(df, features, target_col, date_col, categorical_cols,
                          train_days=365, step_days=30, forecast_days=7,
                          params=None):
    """
    df: DataFrame con features + target
    target_col: nombre de la columna objetivo (ej. 'TOTAL_SALES')
    date_col: columna con la fecha
    categorical_cols: lista de columnas categóricas (deben ser dtype 'category')
    train_days, step_days, forecast_days: enteros en días
    params: dict de parámetros LightGBM
    num_boost_round: número de iteraciones de boosting
    """

    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col)

    results = []
    min_date = df[date_col].min()
    max_date = df[date_col].max()
    start_train_end = min_date + timedelta(days=train_days)

    count = 0

    while start_train_end + timedelta(days=forecast_days) <= max_date:
        count+=1

        print(f"Walk-forward: iteracion numero {count}")

        # Train y Test
        train_data = df[df[date_col] < start_train_end]
        test_data = df[(df[date_col] >= start_train_end) &
                       (df[date_col] < start_train_end + timedelta(days=forecast_days))]

        if len(test_data) == 0:
            break

        # Creamos un validation set para early stopping
        valid_days_inner = 7
        train_end_inner = train_data["DATE"].max() - timedelta(days=valid_days_inner)

        train_inner = train_data[train_data["DATE"] <= train_end_inner]
        valid_inner = train_data[train_data["DATE"] > train_end_inner]

        X_train_inner = train_inner[features]
        y_train_inner = train_inner[target_col]
        X_valid_inner = valid_inner[features]
        y_valid_inner = valid_inner[target_col]

        # Dataset LightGBM
        lgb_train = lgb.Dataset(X_train_inner, label=y_train_inner, categorical_feature=categorical_cols)
        lgb_valid = lgb.Dataset(X_valid_inner, label=y_valid_inner, categorical_feature=categorical_cols, reference=lgb_train)

                
        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train_inner", "valid_inner"]
        )

        # Predicciones
        y_test_pred = model.predict(test_data[features], num_iteration=model.best_iteration)
        y_train_pred = model.predict(X_train_inner,num_iteration=model.best_iteration)
        

        # Métricas
        r2_test = r2_score(test_data[target], y_test_pred)
        r2_train = r2_score(y_train_inner, y_train_pred)


        results.append({
            "train_end_date": start_train_end,
            "r2_train": r2_train,
            "r2_test": r2_test
        })

        start_train_end += timedelta(days=step_days)

    return pd.DataFrame(results)


In [None]:
results_lgb = walk_forward_lightgbm(
    df=datos_unidos,
    features=features,
    target_col="TOTAL_SALES",
    date_col="DATE",
    categorical_cols=cols_categoricas,
    train_days=365,
    step_days=30,
    forecast_days=7,
    params={
        "objective": "regression",
        "metric": "rmse",
        "verbosity": 2,
        "learning_rate": 0.01,
        "num_leaves": 800,
        "max_depth": 20,
        "min_data_in_leaf": 50,
        "feature_fraction": 1,
        "bagging_fraction": 1,
        "bagging_freq": 0,
        "early_stopping_round": 20,
        "num_boost_round":1000
    }
)

Walk-forward: iteracion numero 1
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0,947762
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0,116031
[LightGBM] [Debug] init for col-wise cost 0,042340 seconds, init for row-wise cost 0,494722 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,649151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6987
[LightGBM] [Info] Number of data points in the train set: 2240469, number of used features: 40
[LightGBM] [Info] Start training from score 298,280976
[LightGBM] [Debug] Trained a tree with leaves = 1000 and depth = 25
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 1000 and depth = 25
[LightGBM] [Debug] Trained a tree with leaves = 1000 and depth = 25
[LightGBM] [Debug] Trained a tree with leaves = 1000 and depth = 25
[LightGBM] [Debug] Trained a tr

In [63]:
results_lgb.to_csv("resultados_test/resultados_lgb2.csv")

In [64]:
results_lgb.mean()

train_end_date    2022-12-27 00:00:00
r2_train                     0.845095
r2_test                      0.761484
dtype: object

## Deploy