## Prediccion de demanda semanal

In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import lightgbm as lgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error

import optuna

In [31]:
df = pd.read_csv('../data_clean/data_modelado.csv')
df[["Fecha", "Region", "ID_Region", "Cantidad_Semanal"]]

Unnamed: 0,Fecha,Region,ID_Region,Cantidad_Semanal
0,2024-02-05,Buenos Aires,0,43
1,2024-02-08,Buenos Aires,0,43
2,2024-02-11,Buenos Aires,0,43
3,2024-02-11,Buenos Aires,0,43
4,2024-02-11,Buenos Aires,0,43
...,...,...,...,...
2134,2024-12-03,Patagonia,5,29
2135,2024-12-12,Patagonia,5,19
2136,2024-12-16,Patagonia,5,21
2137,2024-12-21,Patagonia,5,21


In [34]:
# Time Series Split
tscv = TimeSeriesSplit(n_splits=5)

df = df.select_dtypes(include=np.number)

# LGBM usa todas las regiones menos NEA y NOA
df_lgbm = df[~df['ID_Region'].isin([3, 4])]

X_lgbm = df_lgbm.drop(columns=['Cantidad', 'Cantidad_Semanal', 'Monto_Venta'])
y_lgbm = df_lgbm['Cantidad_Semanal']


# Media movil para NEA y NOA
df_ma = df[df['ID_Region'].isin([3, 4])]

X_ma = df_ma.drop(columns=['Cantidad', 'Cantidad_Semanal', 'Monto_Venta'])
y_ma = df_ma['Cantidad_Semanal']

In [35]:
# Hyperparameter tuning con Optuna para LGBM con TSCV
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,  # Suppress warnings
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 256),  # Reduced max
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 50),  # Adjusted range
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'seed': 42,
    }
    
    rmse_scores = []

    for fold, (train_index, val_index) in enumerate(tscv.split(X_lgbm)):
        X_train, X_val = X_lgbm.iloc[train_index], X_lgbm.iloc[val_index]
        y_train, y_val = y_lgbm.iloc[train_index], y_lgbm.iloc[val_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
        
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=1000,
            valid_sets=[lgb_val],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0)
            ]
        )
        
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
        
        # Report for pruning
        trial.report(rmse, fold)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        rmse_scores.append(rmse)
    
    return float(np.mean(rmse_scores))

In [36]:
# Optimización de hiperparámetros con Optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = optuna.samplers.TPESampler(seed=42)
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)

study = optuna.create_study(direction='minimize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Mejores hiperparámetros: ", study.best_params)
print("Mejor RMSE: ", study.best_value)

Best trial: 17. Best value: 11.3527: 100%|██████████| 50/50 [01:39<00:00,  1.99s/it]

Mejores hiperparámetros:  {'learning_rate': 0.020655958307775498, 'num_leaves': 123, 'max_depth': 11, 'min_data_in_leaf': 10, 'feature_fraction': 0.7390907986673838, 'bagging_fraction': 0.8604035330355403, 'bagging_freq': 6, 'lambda_l1': 0.05998558505792558, 'lambda_l2': 1.4534088606555146e-05}
Mejor RMSE:  11.352652577831817





In [None]:
# n optima para media movil en NEA y NOA

# Las ventas semanales en la region seran el promeido de las n semanas anteriores

n=range(1,13)

df_ma["Pred_MA"] = np.nan

def evaluate_ma(n):
    # Empezamos en la semana n por region
    idx = df_ma[df_ma['semana'] == n].index

    # Calculamos la media movil para cada region
    df_ma['Pred_MA'] = df_ma.groupby('ID_Region')['Cantidad_Semanal'].transform(lambda x: x.shift(1).rolling(window=n, min_periods=1).mean())

    # Evaluación del modelo de media movil
    rmse_list = []
    mae_list = []
    max_errs_list = []

    y_true_list = []
    y_pred_list = []

    for fi, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
        y_true = df.loc[test_idx, 'Cantidad_Semanal']
        y_pred = df.loc[test_idx, 'Pred_MA']

        y_true_list.append(y_true)
        y_pred_list.append(y_pred)

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        

        rmse_list.append(rmse)
        mae = mean_absolute_error(y_true, y_pred)
        mae_list.append(mae)
        max_err = np.max(np.abs(y_true - y_pred))
        max_errs_list.append(max_err)

    return {
        'n': n,
        'RMSE': np.mean(rmse_list),
        'MAE': np.mean(mae_list),
        'Max_Error': np.mean(max_errs_list)
    }

ma_results = [evaluate_ma(i) for i in n]
ma_results_df = pd.DataFrame(ma_results)
best_n = ma_results_df.loc[ma_results_df['RMSE'].idxmin(), 'n']
print(f"Mejor n para media movil: {best_n}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ma['Pred_MA'] = df_ma.groupby('ID_Region')['Cantidad_Semanal'].transform(lambda x: x.shift(1).rolling(window=n, min_periods=1).mean())


KeyError: 'Pred_MA'

In [None]:
# 