In [1]:
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Global plotting theme for every figure produced by this notebook.
sns.set_theme(style="darkgrid")

In [2]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace date columns with numeric year / month / day / weekday columns.

    For every configured column ``c`` the transformer appends ``c_year``,
    ``c_month``, ``c_day`` and ``c_weekday`` (Monday = 0) and removes ``c``.

    Parameters
    ----------
    variable : str or iterable of str
        Name, or names, of the columns to expand. Values must be parseable by
        ``pandas.to_datetime``.
    """

    def __init__(self, variable):
        # Stored exactly as received: normalising here would change what
        # get_params() reports, so the str -> list handling lives in transform().
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for pipeline use."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured date column expanded.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every configured column. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` without the original date columns and with the four
            derived integer columns per date column appended at the end.
        """
        X = X.copy()

        # A bare string would otherwise be iterated character by character and
        # fail with a KeyError on the first letter.
        if isinstance(self.variable, str):
            columns = [self.variable]
        else:
            columns = list(self.variable)

        for var in columns:
            dates = pd.to_datetime(X[var])
            X[var + '_year'] = dates.dt.year
            X[var + '_month'] = dates.dt.month
            X[var + '_day'] = dates.dt.day
            X[var + '_weekday'] = dates.dt.dayofweek

            # The raw datetime column is dropped; only the numeric parts remain.
            X = X.drop(columns=[var])
        return X

# Load the raw data from the 'Base de Datos' sheet of the workbook.
df = pd.read_excel('../data/raw/data_sales_forecasting.xlsx', sheet_name='Base de Datos')

# Parse the date column and put the rows in chronological order, with a fresh
# 0..n-1 index so that positional slicing follows time.
df['Fecha'] = pd.to_datetime(df['Fecha'])
df = df.sort_values('Fecha').reset_index(drop=True)

# Column groups, one per preprocessing branch.
date_vars = ['Fecha']
cat_vars = [
    'Codigo_Cupon',
    'Descripcion_Cupon',
    'Codigo_Producto',
    'Tipo_Orden',
    'Tipo_Pago',
    'Canal_Orden',
]

# Numeric candidates are optional: only those actually present in df are used.
possible_nums = ['Cantidad_Vendida', 'Precio_Menu_GTQ', 'No_Tienda']
num_vars = []
for candidate in possible_nums:
    if candidate in df.columns:
        num_vars.append(candidate)

# Build the preprocessing pipeline.
# Date branch: expand each date column into year / month / day / weekday.
date_branch = Pipeline([('extractor', DateFeatureExtractor(date_vars))])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time encode as all zeros.
cat_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric columns are standardised; any column not listed in a branch is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('date', date_branch, date_vars),
        ('cat', cat_branch, cat_vars),
        ('num', StandardScaler(), num_vars),
    ],
    remainder='drop',
)

feature_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

print("Pipeline de preprocesamiento creado.")

Pipeline de preprocesamiento creado.


In [3]:
# Separate the target column from the feature columns.
target = 'Venta_Neta_GTQ'
y = df[target]
X = df.drop(columns=[target])

# Positional 80/20 hold-out with no shuffling: df was sorted by 'Fecha' above,
# so the validation rows are the most recent ones.
split_point = int(len(df) * 0.80)
train_rows = slice(None, split_point)
val_rows = slice(split_point, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]

print(f"Train: {X_train.shape[0]} filas | Validación: {X_val.shape[0]} filas")

Train: 521097 filas | Validación: 130275 filas


In [4]:
print("Transformando datos...")
# Fit the preprocessing pipeline on the training rows only and transform them
# in the same call, so nothing is learned from the validation rows.
X_train_processed = feature_pipeline.fit_transform(X_train, y_train)
# Validation rows go through the already-fitted pipeline (no refit).
X_val_processed = feature_pipeline.transform(X_val)

print(f"Datos transformados. Nueva forma: {X_train_processed.shape}")

Transformando datos...




Datos transformados. Nueva forma: (521097, 220)


In [5]:
# Candidate regressors: (display name, estimator, hyper-parameter grid).
# Every estimator is created with random_state=2025.
_candidates = [
    ('Ridge', Ridge(random_state=2025), {'alpha': [0.1, 1.0, 10.0]}),
    ('Lasso', Lasso(random_state=2025), {'alpha': [0.01, 0.1, 1.0]}),
    ('DecisionTree', DecisionTreeRegressor(random_state=2025), {'max_depth': [5, 10, 20]}),
    (
        'RandomForest',
        RandomForestRegressor(random_state=2025, n_jobs=-1),
        {'n_estimators': [50, 100, 150]},
    ),
    (
        'GradientBoosting',
        GradientBoostingRegressor(random_state=2025),
        {'learning_rate': [0.05, 0.1, 0.2]},
    ),
]

# name -> {'model': estimator, 'params': grid}, in the order listed above.
models_config = {
    label: {'model': estimator, 'params': param_grid}
    for label, estimator, param_grid in _candidates
}

In [6]:
# Exhaustive search: fit every (model, params) combination on the training
# split and score it by RMSE on the validation split. Lowest RMSE wins.
results = []                  # one {'Model', 'Params', 'RMSE'} record per fit
best_score = float('inf')     # lowest validation RMSE seen so far
best_model_name = None
best_params = None
best_model_instance = None    # fitted estimator that achieved best_score

print("Iniciando entrenamiento...")

for name, config in models_config.items():
    print(f"--> Modelo: {name}")
    grid = list(ParameterGrid(config['params']))

    for params in grid:
        # Fit a fresh clone per combination. set_params() returns the same
        # object it was called on, so reusing config['model'] would make
        # best_model_instance an alias that later combinations refit and
        # silently overwrite.
        model = clone(config['model']).set_params(**params)
        model.fit(X_train_processed, y_train)

        preds = model.predict(X_val_processed)
        rmse = np.sqrt(mean_squared_error(y_val, preds))

        results.append({'Model': name, 'Params': str(params), 'RMSE': rmse})
        print(f"    {params} -> RMSE: {rmse:.2f}")

        # Strict '<' keeps the earliest combination when two scores tie.
        if rmse < best_score:
            best_score = rmse
            best_model_name = name
            best_params = params
            best_model_instance = model

print(f"\nGANADOR: {best_model_name} con RMSE: {best_score:.2f}")

Iniciando entrenamiento...
--> Modelo: Ridge
    {'alpha': 0.1} -> RMSE: 15.03
    {'alpha': 1.0} -> RMSE: 15.03
    {'alpha': 10.0} -> RMSE: 15.03
--> Modelo: Lasso
    {'alpha': 0.01} -> RMSE: 15.21
    {'alpha': 0.1} -> RMSE: 16.62
    {'alpha': 1.0} -> RMSE: 20.83
--> Modelo: DecisionTree
    {'max_depth': 5} -> RMSE: 11.52
    {'max_depth': 10} -> RMSE: 7.50
    {'max_depth': 20} -> RMSE: 6.75
--> Modelo: RandomForest
    {'n_estimators': 50} -> RMSE: 5.49
    {'n_estimators': 100} -> RMSE: 5.47
    {'n_estimators': 150} -> RMSE: 5.46
--> Modelo: GradientBoosting
    {'learning_rate': 0.05} -> RMSE: 9.72
    {'learning_rate': 0.1} -> RMSE: 7.42
    {'learning_rate': 0.2} -> RMSE: 6.05

GANADOR: RandomForest con RMSE: 5.46


In [7]:
# Bundle the fitted preprocessing pipeline with the selected estimator so that
# a single object goes from raw feature columns to predictions.
final_pipeline = Pipeline([
    ('preprocessor', feature_pipeline),
    ('model', best_model_instance)
])

output_path = '../models/sales_forecasting_pipeline_v1.pkl'
# joblib.dump does not create missing parent folders, so create it up front.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE(review): DateFeatureExtractor is defined inside this notebook, so code
# that loads this file presumably needs the same class available under the
# same module name - confirm, or move the class into an importable module.
joblib.dump(final_pipeline, output_path)
print(f"Pipeline guardado en: {output_path}")

Pipeline guardado en: ../models/sales_forecasting_pipeline_v1.pkl
