# Modeling

## Load Data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../src')

from utils.metrics import calculate_metrics, print_metrics

# Directory holding the pre-split CSV files.
data_dir = Path('../data/processed')

# Read the three splits the same way: the 'Datetime' column becomes the
# index and is parsed as dates.
train_df, val_df, test_df = (
    pd.read_csv(data_dir / csv_name, index_col='Datetime', parse_dates=True)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Every column except the target is treated as a model feature.
TARGET_COL = 'PJME_MW'
feature_cols = [name for name in train_df.columns if name != TARGET_COL]

# Separate each split into a feature frame (X) and a target series (y).
X_train, X_val, X_test = (
    frame[feature_cols] for frame in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    frame[TARGET_COL] for frame in (train_df, val_df, test_df)
)

# One line per split, followed by a blank line before the .info() dump.
print(
    f"X_train shape: {X_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"X_test shape: {X_test.shape}\n",
    sep="\n",
)

train_df.info()
# Optional further inspection:
# train_df.describe()
# train_df['PJME_MW'].describe()

X_train shape: (101637, 52)
X_val shape: (21779, 52)
X_test shape: (21781, 52)

<class 'pandas.DataFrame'>
DatetimeIndex: 101637 entries, 2002-01-08 02:00:00 to 2013-08-13 22:00:00
Data columns (total 53 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   PJME_MW                 101637 non-null  float64
 1   hour                    101637 non-null  int64  
 2   dayofweek               101637 non-null  int64  
 3   quarter                 101637 non-null  int64  
 4   month                   101637 non-null  int64  
 5   year                    101637 non-null  int64  
 6   dayofyear               101637 non-null  int64  
 7   weekofyear              101637 non-null  int64  
 8   is_weekend              101637 non-null  int64  
 9   is_month_start          101637 non-null  int64  
 10  is_month_end            101637 non-null  int64  
 11  hour_sin                101637 non-null  float64
 12  hour_cos                1

## Baseline Models

In [2]:
from modeling.baseline import LagNaive, RollingMeanNaive  # MovingAverageNaive is currently not imported

# Collects one {'model': <name>, **metrics} record per evaluated model.
# Later cells append to this same list.
results = []


def _score_baseline(model, metrics_label, result_name):
    """Fit a baseline, score it on the validation split, and record the result.

    Fits ``model`` on (X_train, y_train), predicts on X_val, computes metrics
    against y_val with ``calculate_metrics``, prints them with
    ``print_metrics`` and appends a record to the notebook-level ``results``.

    Args:
        model: Baseline object exposing ``fit(X, y)`` and ``predict(X)``.
        metrics_label: Label passed through to ``calculate_metrics``.
        result_name: Value stored under the 'model' key in ``results``.

    Returns:
        Tuple ``(val_pred, metrics)``: the validation predictions and the
        metrics mapping returned by ``calculate_metrics``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    scores = calculate_metrics(y_val, predictions, metrics_label)
    print_metrics(scores)
    results.append({'model': result_name, **scores})
    return predictions, scores


# Lag naive, 1 hour: predict using the 'lag_1' column.
print('Model: Lag Naive 1 hr')
lag_naive = LagNaive(lag_column='lag_1')
val_pred, metrics = _score_baseline(
    lag_naive, 'LagNaive_1hr - Validation', 'Lag_Naive_1hr'
)

# Lag naive, 1 day: predict using the 'lag_24' column.
print('Model: Lag Naive 1 day')
lag_naive = LagNaive(lag_column='lag_24')
val_pred, metrics = _score_baseline(
    lag_naive, 'LagNaive_1day - Validation', 'Lag_Naive_1day'
)

# Lag naive, 1 week: predict using the 'lag_168' column.
print('Model: Lag Naive 1 week')
lag_naive = LagNaive(lag_column='lag_168')
val_pred, metrics = _score_baseline(
    lag_naive, 'LagNaive_1week - Validation', 'Lag_Naive_1week'
)

# Rolling mean over 24 h, read from the 'rolling_mean_24' column.
print("\nModel: Rolling Mean 24h")
rm24 = RollingMeanNaive("rolling_mean_24")
val_pred_24, metrics_24 = _score_baseline(
    rm24, "RollingMean_24 - Validation", 'RollingMean_24'
)

# Rolling mean over 168 h, read from the 'rolling_mean_168' column.
print("\nModel: Rolling Mean 168h")
rm168 = RollingMeanNaive("rolling_mean_168")
val_pred_168, metrics_168 = _score_baseline(
    rm168, "RollingMean_168 - Validation", 'RollingMean_168'
)

# Disabled baselines (previously kept here as commented-out code):
#   MovingAverageNaive(window_columns=[f"lag_{i}" for i in range(1, 25)])
#       -> labelled "Moving Avg 1 day - Validation", stored as 'Moving_Avg_1day'
#   MovingAverageNaive(window_columns=[f'lag_{i}' for i in range(1, 169)])
#       -> labelled 'Moving Avg 1 week - Validation', stored as 'Moving_Avg_1week'
# Re-enabling them also requires importing MovingAverageNaive above.

Model: Lag Naive 1 hr

Métricas - LagNaive_1hr - Validation
MAE:  1,064.21 MW
RMSE: 1,379.42 MW
MAPE: 3.44%
R²:   0.9502

Model: Lag Naive 1 day

Métricas - LagNaive_1day - Validation
MAE:  2,243.93 MW
RMSE: 3,065.84 MW
MAPE: 7.01%
R²:   0.7541

Model: Lag Naive 1 week

Métricas - LagNaive_1week - Validation
MAE:  3,184.74 MW
RMSE: 4,300.53 MW
MAPE: 9.86%
R²:   0.5162


Model: Rolling Mean 24h

Métricas - RollingMean_24 - Validation
MAE:  3,555.37 MW
RMSE: 4,404.84 MW
MAPE: 11.74%
R²:   0.4925


Model: Rolling Mean 168h

Métricas - RollingMean_168 - Validation
MAE:  4,109.94 MW
RMSE: 5,179.41 MW
MAPE: 13.51%
R²:   0.2983



## Traditional ML Models

In [3]:
from modeling.traditional_ml import train_random_forest, train_xgboost, train_lightgbm

# NOTE(review): the captured validation output for these models shows
# R² ≈ 1.0000 and RMSE ≈ 27-109 MW, while the 1-hour lag baseline scores
# RMSE ≈ 1,379 MW. A gap this large looks like possible target leakage in
# the feature columns (e.g. a feature computed from the current-hour target).
# TODO confirm every feature only uses information available before the
# timestamp being predicted.


def _train_and_score(train_fn, header, save_path, metrics_label, result_name):
    """Train one model, score it on the validation split, and record the result.

    Prints ``header``, calls ``train_fn`` with the train/validation splits and
    ``save_path``, computes validation metrics with ``calculate_metrics``,
    prints them with ``print_metrics`` and appends a record to ``results``.

    Args:
        train_fn: One of the ``train_*`` helpers; it is called as
            ``train_fn(X_train, y_train, X_val, y_val, save_path=...)`` and
            its result is unpacked as (model, train predictions,
            validation predictions).
        header: Text printed before training starts.
        save_path: Path forwarded to ``train_fn`` as ``save_path``.
        metrics_label: Label passed through to ``calculate_metrics``.
        result_name: Value stored under the 'model' key in ``results``.

    Returns:
        Tuple ``(model, train_pred, val_pred, metrics)``.
    """
    print(header)
    model, train_pred, val_pred = train_fn(
        X_train, y_train, X_val, y_val,
        save_path=save_path,
    )
    scores = calculate_metrics(y_val, val_pred, metrics_label)
    print_metrics(scores)
    results.append({'model': result_name, **scores})
    return model, train_pred, val_pred, scores


# Random Forest
rf, rf_train_pred, rf_val_pred, metrics_rf = _train_and_score(
    train_random_forest,
    '\n Model: Random Forest',
    '../models/random_forest.pkl',
    'Random Forest - Validation',
    'Random_Forest',
)

# XGBoost
xgb, xgb_train_pred, xgb_val_pred, metrics_xgb = _train_and_score(
    train_xgboost,
    '\n Model: XGBoost',
    '../models/xgboost.pkl',
    'XGBoost - Validation',
    'XGBoost',
)

# LightGBM
lgbm, lgbm_train_pred, lgbm_val_pred, metrics_lgbm = _train_and_score(
    train_lightgbm,
    '\n Model: LightGBM',
    '../models/lightgbm.pkl',
    "LightGBM - Validation",
    'LightGBM',
)


 Model: Random Forest
Entrenando Random Forest...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.4min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    1.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.2s finished


Modelo guardado en: ../models/random_forest.pkl

Métricas - Random Forest - Validation
MAE:  15.43 MW
RMSE: 26.58 MW
MAPE: 0.05%
R²:   1.0000


 Model: XGBoost
Entrenando XGBoost...
[0]	validation_0-rmse:5947.45452
[1]	validation_0-rmse:5656.43724
[2]	validation_0-rmse:5375.63968
[3]	validation_0-rmse:5108.64781
[4]	validation_0-rmse:4855.08355
[5]	validation_0-rmse:4619.88753
[6]	validation_0-rmse:4393.04985
[7]	validation_0-rmse:4175.28764
[8]	validation_0-rmse:3977.00670
[9]	validation_0-rmse:3780.23604
[10]	validation_0-rmse:3600.72194
[11]	validation_0-rmse:3426.88174
[12]	validation_0-rmse:3261.99061
[13]	validation_0-rmse:3100.74511
[14]	validation_0-rmse:2949.40833
[15]	validation_0-rmse:2808.48342
[16]	validation_0-rmse:2670.12693
[17]	validation_0-rmse:2538.62430
[18]	validation_0-rmse:2413.46866
[19]	validation_0-rmse:2294.73633
[20]	validation_0-rmse:2181.89725
[21]	validation_0-rmse:2074.68952
[22]	validation_0-rmse:1972.77248
[23]	validation_0-rmse:1876.09725
[24]	validat

In [5]:
# =====================
# Model Comparison
# =====================

# Destination of the ranking table (kept as a plain string so the printed
# confirmation message shows the path exactly as written here).
RESULTS_CSV = '../results/model_comparison.csv'

# `results` is appended to by the training cells, so re-running one of them
# leaves several rows for the same model. Keep only the most recent row per
# model before ranking; with no duplicates this is a no-op.
results_df = (
    pd.DataFrame(results)
    .drop_duplicates(subset='model', keep='last')
    .sort_values('RMSE')
)

print("\n" + "="*70)
print(" Ranking de Modelos (ordenado por RMSE)")
print("="*70)
print(results_df[['model', 'MAE', 'RMSE', 'MAPE', 'R2']].to_string(index=False))

# Persist the ranking. Create the output directory first so a fresh checkout
# without ../results/ does not fail on to_csv.
Path(RESULTS_CSV).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(RESULTS_CSV, index=False)
print(f"\n Resultados guardados en: {RESULTS_CSV}")


 Ranking de Modelos (ordenado por RMSE)
          model         MAE        RMSE      MAPE       R2
  Random_Forest   15.427345   26.583666  0.049382 0.999982
        XGBoost   71.132281   95.636808  0.231548 0.999761
       LightGBM   82.624705  108.567527  0.267278 0.999692
  Lag_Naive_1hr 1064.210478 1379.421739  3.441808 0.950227
 Lag_Naive_1day 2243.931585 3065.843199  7.014682 0.754136
Lag_Naive_1week 3184.738280 4300.530747  9.857352 0.516229
 RollingMean_24 3555.367587 4404.842137 11.740535 0.492476
RollingMean_168 4109.944946 5179.412595 13.514516 0.298292

 Resultados guardados en: ../results/model_comparison.csv
