In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import warnings

# NOTE(review): this silences every warning for the whole process, including
# deprecation notices raised by the libraries used below; consider narrowing
# the filter to specific warning categories.
warnings.filterwarnings('ignore')

def calculate_wape(y_true, y_pred):
    """Return the Weighted Average Percentage Error (WAPE) as a percentage.

    WAPE = sum(|y_true - y_pred|) / sum(|y_true|) * 100.

    Both inputs are converted to flat float arrays and compared by position,
    so plain lists, NumPy arrays and pandas Series are all accepted, and two
    Series with different indexes are not silently realigned by label.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The WAPE in percent, or NaN when ``sum(|y_true|)`` is zero (which
        includes empty input), because the metric is undefined in that case.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    total_actual = np.sum(np.abs(actual))
    if total_actual == 0:
        # Undefined metric: report NaN explicitly instead of dividing by zero.
        return np.nan

    return np.sum(np.abs(actual - predicted)) / total_actual * 100

class EnergyConsumptionPredictorXGB:
    """XGBoost regressor for billed energy per company and month.

    Workflow: ``feature_engineering`` -> ``temporal_split`` -> ``train`` ->
    ``evaluate``. Everything that is learned from data (per-company statistics,
    the company label encoding and the list of feature columns) is fitted on
    the training split only and then reused unchanged on validation and test.
    """

    TARGET_COL = 'Energía Facturada (MWh)'
    COMPANY_STAT_COLS = [
        'empresa_mean_historica',
        'empresa_std_historica',
        'empresa_max_historica',
    ]
    # Columns that are never fed to the model as features.
    EXCLUDE_COLS = ['IdEmpresa', 'Energía Facturada (MWh)', 'Año']

    def __init__(self):
        self.model = None
        self.company_encoder = LabelEncoder()
        self.feature_importance = None
        # Per-company statistics learned from the training set.
        self.company_stats_map = None
        # Feature columns (and their order) used to fit the model.
        self.feature_cols = None

    # --- Generic feature engineering (lags, rolling windows, seasonality) ---
    def create_lag_features(self, df, target_col, lags):
        """Return a copy of ``df`` with one lagged target column per lag.

        The shift is applied within each ``IdEmpresa`` group in the row order
        of ``df``, so ``df`` must already be sorted chronologically.
        """
        df_lag = df.copy()
        for lag in lags:
            df_lag[f'{target_col}_lag_{lag}'] = df_lag.groupby('IdEmpresa')[target_col].shift(lag)
        return df_lag

    def create_rolling_features(self, df, target_col, windows):
        """Return a copy of ``df`` with rolling mean/std columns per window.

        Each series is shifted by one row before rolling so that the current
        row's target never contributes to its own feature.
        """
        df_roll = df.copy()
        for window in windows:
            df_roll[f'{target_col}_rolling_mean_{window}'] = df_roll.groupby('IdEmpresa')[target_col].transform(
                lambda x: x.shift(1).rolling(window=window, min_periods=1).mean()
            )
            df_roll[f'{target_col}_rolling_std_{window}'] = df_roll.groupby('IdEmpresa')[target_col].transform(
                lambda x: x.shift(1).rolling(window=window, min_periods=1).std()
            )
        return df_roll

    def create_seasonal_features(self, df):
        """Return a copy of ``df`` with a sine/cosine encoding of ``IdMes``."""
        df_seasonal = df.copy()
        df_seasonal['mes_sin'] = np.sin(2 * np.pi * df_seasonal['IdMes'] / 12)
        df_seasonal['mes_cos'] = np.cos(2 * np.pi * df_seasonal['IdMes'] / 12)
        return df_seasonal

    # --- Company-level features: fitted on the training set only ---
    def fit_company_features(self, df_train):
        """Learn per-company statistics and the company encoding.

        Only ``df_train`` is used, so nothing from validation or test leaks
        into the learned values.

        Args:
            df_train: Training frame with ``IdEmpresa`` and the target column.
        """
        print("Aprendiendo estadísticas de empresas del set de entrenamiento...")
        company_stats = df_train.groupby('IdEmpresa')[self.TARGET_COL].agg(
            mean='mean', std='std', max='max'
        ).reset_index()
        company_stats.columns = ['IdEmpresa'] + self.COMPANY_STAT_COLS
        self.company_stats_map = company_stats.set_index('IdEmpresa')

        # Fit the encoder once, here, so every later transform assigns the
        # same integer code to the same company.
        self.company_encoder.fit(df_train['IdEmpresa'])

    def transform_company_features(self, df):
        """Attach the learned company statistics and encoding to ``df``.

        Companies that were not present in the training set receive the mean
        of each learned statistic and the code ``-1``.

        Args:
            df: Any split (train, validation or test) with ``IdEmpresa``.

        Returns:
            A new frame with the statistic columns and ``IdEmpresa_encoded``.

        Raises:
            RuntimeError: If ``fit_company_features`` has not been called.
        """
        if self.company_stats_map is None:
            raise RuntimeError("Debes llamar a 'fit_company_features' primero.")

        df_company = df.merge(self.company_stats_map, on='IdEmpresa', how='left')

        # Assign the result back instead of using a chained in-place fillna,
        # which is deprecated and has no effect under pandas Copy-on-Write.
        for col in self.COMPANY_STAT_COLS:
            df_company[col] = df_company[col].fillna(self.company_stats_map[col].mean())

        # Reuse the training-time encoding; never refit on the incoming data.
        code_by_company = {
            company: code
            for code, company in enumerate(self.company_encoder.classes_)
        }
        df_company['IdEmpresa_encoded'] = (
            df_company['IdEmpresa'].map(code_by_company).fillna(-1).astype(int)
        )
        return df_company

    def feature_engineering(self, df):
        """Build the split-independent features (lags, rolling, seasonal, trend).

        Company-level features are intentionally not added here; they are
        applied after the temporal split.
        """
        print("Iniciando Feature Engineering...")
        df = df.sort_values(['IdEmpresa', 'Año', 'IdMes']).reset_index(drop=True)

        lag_periods = [1, 2, 3, 6, 12]
        df = self.create_lag_features(df, self.TARGET_COL, lag_periods)

        rolling_windows = [3, 6, 12]
        df = self.create_rolling_features(df, self.TARGET_COL, rolling_windows)

        df = self.create_seasonal_features(df)
        # Month counter measured from the first year present in the data.
        df['tendencia_temporal'] = (df['Año'] - df['Año'].min()) * 12 + df['IdMes']

        return df

    def temporal_split(self, df, train_end_year, val_end_year):
        """Split ``df`` by year into train, validation and test frames.

        Train holds years ``<= train_end_year``, validation holds years in
        ``(train_end_year, val_end_year]`` and test holds later years.
        """
        print(f"Dividiendo datos: Train <= {train_end_year}, Val <= {val_end_year}, Test > {val_end_year}")

        train_df = df[df['Año'] <= train_end_year].copy()
        val_df = df[(df['Año'] > train_end_year) & (df['Año'] <= val_end_year)].copy()
        test_df = df[df['Año'] > val_end_year].copy()

        print(f"Train: {len(train_df)} muestras ({train_df['Año'].min()}-{train_df['Año'].max()})")
        print(f"Validation: {len(val_df)} muestras ({val_df['Año'].min()}-{val_df['Año'].max()})")
        print(f"Test: {len(test_df)} muestras ({test_df['Año'].min()}-{test_df['Año'].max()})")

        return train_df, val_df, test_df

    def train(self, df_train, df_val):
        """Fit the XGBoost model, using ``df_val`` for early stopping.

        Args:
            df_train: Training split produced by ``temporal_split``.
            df_val: Validation split; it is the last entry of ``eval_set``
                and therefore the one early stopping monitors.
        """
        print("Preparando datos para entrenamiento...")

        # 1. Learn the company features from the training set and apply them.
        self.fit_company_features(df_train)
        train_processed = self.transform_company_features(df_train)

        # 2. Apply the same learned transformation to the validation set.
        val_processed = self.transform_company_features(df_val)

        # Drop rows without a target. Missing lag/rolling values are not
        # dropped; they are filled with 0 below.
        train_processed = train_processed.dropna(subset=[self.TARGET_COL])
        val_processed = val_processed.dropna(subset=[self.TARGET_COL])

        y_train = train_processed[self.TARGET_COL]
        y_val = val_processed[self.TARGET_COL]

        # Remember the exact feature list so evaluation uses the same columns
        # in the same order.
        self.feature_cols = [
            col for col in train_processed.columns if col not in self.EXCLUDE_COLS
        ]

        X_train = train_processed[self.feature_cols].fillna(0)
        X_val = val_processed[self.feature_cols].fillna(0)

        print(f"Entrenando con {len(X_train.columns)} features.")

        params = {
            'n_estimators': 1000,  # upper bound; early stopping picks the best round
            'learning_rate': 0.03,
            'max_depth': 7,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1,
            'eval_metric': 'rmse',
            'early_stopping_rounds': 50,
        }
        self.model = xgb.XGBRegressor(**params)

        eval_set = [(X_train, y_train), (X_val, y_val)]
        self.model.fit(
            X_train, y_train,
            eval_set=eval_set,
            verbose=False
        )

        self.feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("Entrenamiento completado.")

    def evaluate(self, df_test):
        """Print global and per-company metrics for the test split.

        Rows without a target are dropped, as in ``train``. The per-company
        MAPE is computed over rows whose actual value is non-zero and is NaN
        when a company has no such rows.

        Raises:
            RuntimeError: If ``train`` has not been called.
        """
        if self.model is None or self.feature_cols is None:
            raise RuntimeError("Debes llamar a 'train' antes de 'evaluate'.")

        print("Evaluando en el conjunto de prueba...")

        # Apply the transformations learned during training.
        test_processed = self.transform_company_features(df_test)
        test_processed = test_processed.dropna(subset=[self.TARGET_COL]).reset_index(drop=True)

        y_test = test_processed[self.TARGET_COL]
        X_test = test_processed[self.feature_cols].fillna(0)

        y_pred = self.model.predict(X_test)

        # Global metrics
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        wape = calculate_wape(y_test, y_pred)

        print("\n=== MÉTRICAS GENERALES (TEST SET) ===")
        print(f"WAPE: {wape:.2f}%")
        print(f"MAE: {mae:.2f} MWh")
        print(f"RMSE: {rmse:.2f} MWh")
        print(f"R²: {r2:.4f}")

        # Per-company metrics
        results_df = pd.DataFrame({
            'IdEmpresa': test_processed['IdEmpresa'],
            'y_true': y_test,
            'y_pred': y_pred
        })

        company_metrics_list = []
        for empresa_id, subset in results_df.groupby('IdEmpresa', sort=False):
            nonzero = subset['y_true'] != 0
            if nonzero.any():
                actual = subset.loc[nonzero, 'y_true']
                predicted = subset.loc[nonzero, 'y_pred']
                mape_emp = np.mean(np.abs((actual - predicted) / actual)) * 100
            else:
                # Percentage error is undefined when every actual value is 0.
                mape_emp = np.nan

            company_metrics_list.append({
                'IdEmpresa': empresa_id,
                'mae_empresa': mean_absolute_error(subset['y_true'], subset['y_pred']),
                'mape_empresa': mape_emp,
                'r2_empresa': r2_score(subset['y_true'], subset['y_pred']),
                'mean_consumption': subset['y_true'].mean()
            })

        company_metrics = pd.DataFrame(company_metrics_list).set_index('IdEmpresa')
        print("\n=== MÉTRICAS POR EMPRESA (TEST SET) ===")
        print(company_metrics.sort_values('mape_empresa'))

# --- Pipeline de Ejecución ---
def run_xgb_pipeline(df, train_end_year=2016, val_end_year=2017):
    """Run feature engineering, temporal split, training and evaluation.

    Args:
        df: Raw dataset; the pipeline reads the columns ``IdEmpresa``,
            ``Año``, ``IdMes`` and ``Energía Facturada (MWh)``.
        train_end_year: Last year (inclusive) of the training split.
        val_end_year: Last year (inclusive) of the validation split; later
            years form the test split.

    Returns:
        The trained ``EnergyConsumptionPredictorXGB`` instance.
    """
    predictor = EnergyConsumptionPredictorXGB()

    # 1. Split-independent feature engineering.
    df_featured = predictor.feature_engineering(df)

    # 2. Strict temporal split.
    train_df, val_df, test_df = predictor.temporal_split(
        df_featured,
        train_end_year=train_end_year,
        val_end_year=val_end_year,
    )

    # 3. Training (fits on train, early-stops on validation).
    predictor.train(train_df, val_df)

    # 4. Final evaluation on the held-out test split.
    predictor.evaluate(test_df)

    return predictor

# Usage: load the combined dataset from CSV and run the whole pipeline.
# The pipeline reads the columns 'IdEmpresa', 'Año', 'IdMes' and
# 'Energía Facturada (MWh)' from this frame.
df = pd.read_csv('df_dataset_unidos5.csv')
# The returned predictor holds the fitted model and its feature importances.
xgb_predictor = run_xgb_pipeline(df)

Iniciando Feature Engineering...
Dividiendo datos: Train <= 2016, Val <= 2017, Test > 2017
Train: 1805 muestras (2009-2016)
Validation: 228 muestras (2017-2017)
Test: 456 muestras (2018-2019)
Preparando datos para entrenamiento...
Aprendiendo estadísticas de empresas del set de entrenamiento...
Entrenando con 24 features.
Entrenamiento completado.
Evaluando en el conjunto de prueba...

=== MÉTRICAS GENERALES (TEST SET) ===
WAPE: 5.51%
MAE: 5173.96 MWh
RMSE: 9631.38 MWh
R²: 0.9921

=== MÉTRICAS POR EMPRESA (TEST SET) ===
            mae_empresa  mape_empresa  r2_empresa  mean_consumption
IdEmpresa                                                          
11          1153.497634      2.080807    0.025145      55445.964134
30          2073.557282      2.282737   -0.021840      90753.610635
142         1109.783223      2.349026   -0.006403      48240.730685
191         1409.922541      2.465282    0.314462      57414.182110
174         9019.203312      2.493576   -1.282100     360881.94539