In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from models import (
    PersistenceModel,
    LinearRegressionModel,
    RandomForestModel,
    XGBoostModel,
)

from transform_pipeline import (
    AsymmetricPostHocRegressor,
    asymmetric_mse_loss,
    eval_asymmetric_loss
)
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from data.data_helpers import get_dataframe



In [2]:
# Load the dataset and drop rows without a next-hour target.
df = get_dataframe()
df = df.dropna(subset=["target_next_hour"])

# The positional (iloc) split below is only "time-respecting" if rows are in
# chronological order, so enforce that explicitly instead of relying on the
# loader. A stable sort is a no-op for data that is already ordered.
df = df.sort_values("datetime", kind="mergesort")

# Temporal split: everything before the cutoff → train/val, the rest → test.
# A single cutoff variable keeps the two masks from drifting apart.
test_start = "2020-01-01"
is_test = df["datetime"] >= test_start
train_df = df[~is_test].copy()
test_df  = df[is_test].copy()

# IMPORTANT: keep full DataFrames, including 'datetime' and 'target_next_hour'.
# NOTE(review): the target column therefore stays inside the feature frames;
# presumably the model wrappers select their own feature columns — confirm
# that none of them can read 'target_next_hour' as an input.
X_train_full = train_df.copy()
X_test       = test_df.copy()

# Targets as float Series
y_train_full = train_df["target_next_hour"].astype(float)
y_test       = test_df["target_next_hour"].astype(float)

# Time-respecting split inside the training window:
# first 80% of train_df → model training
# last 20% of train_df → post-hoc transform / validation
split_idx = int(len(X_train_full) * 0.8)

X_train = X_train_full.iloc[:split_idx].copy()
y_train = y_train_full.iloc[:split_idx].copy()

X_val   = X_train_full.iloc[split_idx:].copy()
y_val   = y_train_full.iloc[split_idx:].copy()

print(f"Train size: {len(X_train)}")
print(f"Val size:   {len(X_val)}")
print(f"Test size:  {len(X_test)}")


2025-12-06 17:23:22,755 INFO [2025-12-03T00:00:00Z] To improve our C3S service, we need to hear from you! Please complete this very short [survey](https://confluence.ecmwf.int/x/E7uBEQ/). Thank you.


Skipping existing file /Users/alfiehofmann/cis5200-project/data/era5_tehachapi_2018_H1.nc
Skipping existing file /Users/alfiehofmann/cis5200-project/data/era5_tehachapi_2018_H2.nc
Skipping existing file /Users/alfiehofmann/cis5200-project/data/era5_tehachapi_2019_H1.nc
Skipping existing file /Users/alfiehofmann/cis5200-project/data/era5_tehachapi_2019_H2.nc
Skipping existing file /Users/alfiehofmann/cis5200-project/data/era5_tehachapi_2020_H1.nc
Skipping existing file /Users/alfiehofmann/cis5200-project/data/era5_tehachapi_2020_H2.nc
Train size: 14016
Val size:   3504
Test size:  8783


In [3]:
# Registry of base forecasters, keyed by the short label used in the
# training logs and the final comparison table. Insertion order determines
# the order in which later cells train and evaluate the models.
base_models = dict(
    persist=PersistenceModel(),
    lin=LinearRegressionModel(),
    rf=RandomForestModel(),
    xgb=XGBoostModel(),
)


In [4]:
# Fit every registered base model on the training slice, announcing each
# one first so slow fits are visible in the cell output.
for name in base_models:
    model = base_models[name]
    print(f"Training {name} ...")
    model.fit(X_train, y_train)

Training persist ...
Training lin ...
Training rf ...


  ret = a @ b
  ret = a @ b
  ret = a @ b


Training xgb ...


In [5]:
# Asymmetry parameter shared by the post-hoc fit and the evaluation loss.
# NOTE(review): the name suggests it weights under-prediction — confirm
# against asymmetric_mse_loss in transform_pipeline.
alpha_under = 2.0

# Fitted post-hoc wrappers, keyed by the same labels as base_models.
posthoc_models = {}

for name in base_models:
    model = base_models[name]
    print(f"\n=== Fitting post-hoc transform for {name} ===")

    # Wrap the already-trained base model; the wrapper is fitted on the
    # validation slice rather than on the data the base model was trained on.
    wrapper = AsymmetricPostHocRegressor(
        base_model=model,
        alpha=alpha_under,
        lr=1e-2,
        n_steps=500,
        verbose=True,
        min_improvement=0.0,  # require strict improvement only
    )
    wrapper.fit(X_val, y_val)

    # Register only after a successful fit.
    posthoc_models[name] = wrapper



=== Fitting post-hoc transform for persist ===
[AsymmetricPostHocRegressor] step=  0 loss=0.290316 a=1.0100 b=0.0100
[AsymmetricPostHocRegressor] step=100 loss=0.268942 a=0.9554 b=0.2297
[AsymmetricPostHocRegressor] step=200 loss=0.268942 a=0.9551 b=0.2309
[AsymmetricPostHocRegressor] step=300 loss=0.268942 a=0.9551 b=0.2308
[AsymmetricPostHocRegressor] step=400 loss=0.268942 a=0.9551 b=0.2308
[AsymmetricPostHocRegressor] base_loss=0.290316, posthoc_loss=0.268942
Using post-hoc transform: a=0.9551, b=0.2308

=== Fitting post-hoc transform for lin ===
[AsymmetricPostHocRegressor] step=  0 loss=0.227132 a=1.0100 b=0.0100
[AsymmetricPostHocRegressor] step=100 loss=0.216428 a=1.0026 b=0.0796
[AsymmetricPostHocRegressor] step=200 loss=0.216428 a=1.0025 b=0.0800
[AsymmetricPostHocRegressor] step=300 loss=0.216428 a=1.0025 b=0.0800
[AsymmetricPostHocRegressor] step=400 loss=0.216428 a=1.0025 b=0.0800
[AsymmetricPostHocRegressor] base_loss=0.227132, posthoc_loss=0.216428
Using post-hoc transf

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [6]:
# Report the asymmetric loss on the held-out test frame for each base model
# and for its post-hoc-wrapped counterpart, one table row per model.
print("\n=== Test Set Comparison (Using Asymmetric Loss) ===")

for name in base_models:
    model = base_models[name]

    # Same data and same alpha for both, so the two numbers are comparable.
    base_loss = eval_asymmetric_loss(model, X_test, y_test, alpha=alpha_under)
    ph_loss   = eval_asymmetric_loss(posthoc_models[name], X_test, y_test, alpha=alpha_under)

    # NOTE(review): use_transform_ presumably records whether the fitted
    # transform was kept — confirm in AsymmetricPostHocRegressor.
    print(
        f"{name:>7} | Base Loss: {base_loss:.4f} "
        f"| Post-hoc Loss: {ph_loss:.4f} "
        f"| Transform Used: {posthoc_models[name].use_transform_}"
    )


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



=== Test Set Comparison (Using Asymmetric Loss) ===
persist | Base Loss: 0.3088 | Post-hoc Loss: 0.2862 | Transform Used: True
    lin | Base Loss: 0.2374 | Post-hoc Loss: 0.2211 | Transform Used: True
     rf | Base Loss: 0.2057 | Post-hoc Loss: 0.1929 | Transform Used: True
    xgb | Base Loss: 0.1739 | Post-hoc Loss: 0.1634 | Transform Used: True
