In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/content/drive/MyDrive/duong/STLF/df_fe.csv")

In [3]:
df.head()

Unnamed: 0,timestamp,Temperature,Weather,Precipitation,Chance of snow,Humidity,Wind,Wind Gust,Wind Degree,Cloud Cover,...,sin_hour_roll24,sin_doy_lag1,sin_doy_lag24,sin_doy_roll24,Wind Degree_lag1,Wind Degree_lag24,Wind Degree_roll24,wind_dir_deg,wind_dir_sin,wind_dir_cos
0,2020-01-01 00:00:00,25.4,Patchy rain possible,0.6,0.0,89.0,2.194444,4.388889,295.0,89.0,...,,,,,,,,292.5,-0.92388,0.382683
1,2020-01-01 01:00:00,25.1,Partly cloudy,0.0,0.0,90.0,2.611111,5.111111,297.0,34.0,...,,0.017202,,,295.0,,,292.5,-0.92388,0.382683
2,2020-01-01 02:00:00,24.7,Patchy rain possible,0.0,0.0,91.0,2.805556,5.5,309.0,87.0,...,,0.017202,,,297.0,,,315.0,-0.707107,0.707107
3,2020-01-01 03:00:00,24.5,Cloudy,0.0,0.0,92.0,2.611111,4.888889,325.0,71.0,...,,0.017202,,,309.0,,,315.0,-0.707107,0.707107
4,2020-01-01 04:00:00,24.1,Patchy rain possible,0.0,0.0,93.0,2.305556,4.0,326.0,100.0,...,,0.017202,,,325.0,,,337.5,-0.382683,0.92388


In [4]:
df.shape

(47449, 66)

In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [6]:
df.loc[0, 'timestamp']

Timestamp('2020-01-01 00:00:00')

In [7]:
import warnings, math, random
warnings.filterwarnings("ignore")

import numpy as np
from prophet import Prophet

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [8]:
TARGET_COL = "y_tgt"
OLD_TARGET_COL = "total_consumption_mw"
VALID_HOURS = 24 * 14                  # fast holdout: last 14 days
TOPK_NUMERIC = 25                      # choose top-K numeric regressors
N_RANDOM_TRIALS = 10                   # fast random search
CAT_COLS = ["Weather", "season", "is_holiday"]

In [9]:
all_cols = df.columns.tolist()

In [12]:
X_cols = [
    c for c in df.columns
    if c not in ["timestamp", TARGET_COL, OLD_TARGET_COL] + CAT_COLS
]
num_cols = [c for c in X_cols if pd.api.types.is_numeric_dtype(df[c])]

In [13]:
num_cols

['Temperature',
 'Precipitation',
 'Chance of snow',
 'Humidity',
 'Wind',
 'Wind Gust',
 'Wind Degree',
 'Cloud Cover',
 'Visibility',
 'is_weekend',
 'hour',
 'dow',
 'month',
 'doy',
 'sin_hour',
 'cos_hour',
 'sin_doy',
 'cos_doy',
 'y_lag_1',
 'y_lag_24',
 'y_lag_168',
 'y_rollmean_3',
 'y_rollstd_3',
 'y_rollmean_24',
 'y_rollstd_24',
 'y_rollmean_168',
 'y_rollstd_168',
 'cos_hour_lag1',
 'cos_hour_lag24',
 'cos_hour_roll24',
 'Humidity_lag1',
 'Humidity_lag24',
 'Humidity_roll24',
 'Temperature_lag1',
 'Temperature_lag24',
 'Temperature_roll24',
 'cos_doy_lag1',
 'cos_doy_lag24',
 'cos_doy_roll24',
 'hour_lag1',
 'hour_lag24',
 'hour_roll24',
 'Precipitation_lag1',
 'Precipitation_lag24',
 'Precipitation_roll24',
 'Wind_lag1',
 'Wind_lag24',
 'Wind_roll24',
 'sin_hour_lag1',
 'sin_hour_lag24',
 'sin_hour_roll24',
 'sin_doy_lag1',
 'sin_doy_lag24',
 'sin_doy_roll24',
 'Wind Degree_lag1',
 'Wind Degree_lag24',
 'Wind Degree_roll24',
 'wind_dir_deg',
 'wind_dir_sin',
 'wind_dir_co

In [14]:
cat_cols = CAT_COLS

In [15]:
if len(df) <= VALID_HOURS + 100:
    raise ValueError("Not enough rows for the chosen validation window; reduce VALID_HOURS.")

In [18]:
for catcol in cat_cols:
    if catcol in df.columns:
        df[catcol] = df[catcol].astype("category")

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47449 entries, 0 to 47448
Data columns (total 66 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   timestamp             47449 non-null  datetime64[ns]
 1   Temperature           47449 non-null  float64       
 2   Weather               47449 non-null  category      
 3   Precipitation         47449 non-null  float64       
 4   Chance of snow        47449 non-null  float64       
 5   Humidity              47449 non-null  float64       
 6   Wind                  47449 non-null  float64       
 7   Wind Gust             47449 non-null  float64       
 8   Wind Degree           47449 non-null  float64       
 9   Cloud Cover           47449 non-null  float64       
 10  Visibility            47449 non-null  float64       
 11  is_weekend            47449 non-null  float64       
 12  season                47449 non-null  category      
 13  is_holiday      

In [20]:
train_df = df.iloc[:-VALID_HOURS].copy()
valid_df = df.iloc[-VALID_HOURS:].copy()

In [22]:
# Leakage-safe categorical encoding
# is_holiday -> 0/1 if boolean-like

if "is_holiday" in cat_cols:
    # robust cast
    holiday_train = pd.get_dummies(train_df["is_holiday"], prefix="is_holiday")
    holiday_valid = pd.get_dummies(valid_df["is_holiday"], prefix="is_holiday")
    # align columns
    holiday_valid = holiday_valid.reindex(columns=holiday_train.columns, fill_value=0)
    print("ok")

# season -> one-hot (train fit, apply to valid)
if "season" in cat_cols:
    season_train = pd.get_dummies(train_df["season"], prefix="season")
    season_valid = pd.get_dummies(valid_df["season"], prefix="season")
    # align columns
    season_valid = season_valid.reindex(columns=season_train.columns, fill_value=0)
    print("ok")

# Weather -> frequency encoding (train fit, apply to valid)
if "Weather" in cat_cols:
    freq_map = train_df["Weather"].value_counts(normalize=True)
    train_df["Weather_fe"] = train_df["Weather"].map(freq_map).fillna(0.0)
    valid_df["Weather_fe"] = valid_df["Weather"].map(freq_map).fillna(0.0)
else:
    train_df["Weather_fe"] = np.nan
    valid_df["Weather_fe"] = np.nan


ok
ok


In [23]:
# Numeric feature selection (top-K by |corr| with target on train)
corrs = []
for c in num_cols:
    try:
        v = train_df[c].astype(float)
        if v.notna().sum() < 10:  # skip sparse
            continue
        corr = abs(np.corrcoef(v[~v.isna()], train_df.loc[~v.isna(), TARGET_COL])[0,1])
        if not np.isfinite(corr):
            continue
        corrs.append((c, corr))
    except Exception:
        pass

corrs.sort(key=lambda x: x[1], reverse=True)
selected_numeric = [c for c, _ in corrs[:TOPK_NUMERIC]]
selected_numeric

['y_lag_24',
 'y_lag_168',
 'y_lag_1',
 'y_rollmean_3',
 'y_rollmean_24',
 'cos_hour_lag1',
 'y_rollmean_168',
 'cos_hour_lag24',
 'cos_hour',
 'y_rollstd_24',
 'y_rollstd_168',
 'Humidity',
 'Humidity_lag24',
 'Temperature',
 'Temperature_lag24',
 'cos_doy_lag24',
 'cos_doy_roll24',
 'cos_doy_lag1',
 'cos_doy',
 'Humidity_lag1',
 'Temperature_lag1',
 'wind_dir_sin',
 'hour',
 'hour_lag24',
 'wind_dir_cos']

In [25]:
# Build regressor matrix
# Start with selected numeric + encoded cats
regressor_cols = selected_numeric.copy()
if "Weather" in cat_cols:    regressor_cols.append("Weather_fe")

# Concatenate one-hot encoded categorical features back to train_df and valid_df
train_df = pd.concat([train_df, season_train, holiday_train], axis=1)
valid_df = pd.concat([valid_df, season_valid, holiday_valid], axis=1)

# add season one-hots
season_train_cols = season_train.columns.tolist()
regressor_cols += season_train_cols
# add is_holiday one-hots
holiday_train_cols = holiday_train.columns.tolist()
regressor_cols += holiday_train_cols

# Assemble X matrices
X_train = train_df[["timestamp"] + regressor_cols].copy()
X_valid = valid_df[["timestamp"] + regressor_cols].copy()

# Impute missing numeric values with train medians
medians = X_train[regressor_cols].median(numeric_only=True)
X_train[regressor_cols] = X_train[regressor_cols].fillna(medians)
X_valid[regressor_cols] = X_valid[regressor_cols].fillna(medians)

# Standardize regressors (fit on train only)
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_valid_scaled = X_valid.copy()

X_train_scaled[regressor_cols] = scaler.fit_transform(X_train[regressor_cols].values)
X_valid_scaled[regressor_cols] = scaler.transform(X_valid[regressor_cols].values)

# Reattach season one-hots (already included) for both
# (season_train / season_valid already aligned above)
# for col in season_train_cols:
#     X_train_scaled[col] = season_train[col].values
#     X_valid_scaled[col] = season_valid[col].values

# Reattach is_holiday one-hots (already included) for both
# (holiday_train / holiday_valid already aligned above)
# for col in holiday_train_cols:
#     X_train_scaled[col] = holiday_train[col].values
#     X_valid_scaled[col] = holiday_valid[col].values

# Prepare Prophet training frames (Prophet expects 'ds' and 'y')
train_prophet = pd.DataFrame({
    "ds": X_train_scaled["timestamp"].values,
    "y": train_df[TARGET_COL].values
})
valid_prophet = pd.DataFrame({
    "ds": X_valid_scaled["timestamp"].values,
    "y": valid_df[TARGET_COL].values
})

# Ensure regressor alignment
regressor_cols = [c for c in regressor_cols if c != "timestamp"]  # safety
for c in regressor_cols:
    train_prophet[c] = X_train_scaled[c].values
    valid_prophet[c] = X_valid_scaled[c].values

train_prophet.shape, valid_prophet.shape

((47113, 35), (336, 35))

In [26]:
# Metric helpers
def rmse(y_true, y_pred):
    return math.sqrt(mean_squared_error(y_true, y_pred))

def smape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

In [28]:
# Replace inf -> NaN (so we can impute), then impute using train medians; backfill any leftover with 0
for X in (X_train, X_valid):
    X[regressor_cols] = X[regressor_cols].replace([np.inf, -np.inf], np.nan)

medians = X_train[regressor_cols].median(numeric_only=True).fillna(0.0)
X_train[regressor_cols] = X_train[regressor_cols].fillna(medians).fillna(0.0)
X_valid[regressor_cols] = X_valid[regressor_cols].fillna(medians).fillna(0.0)

# (optional) drop truly bad columns that are still all-NaN/all-const after imputation
bad_cols = [c for c in regressor_cols if not np.isfinite(X_train[c]).all() or not np.isfinite(X_valid[c]).all()]
if bad_cols:
    print("Dropping bad columns:", bad_cols)
    regressor_cols = [c for c in regressor_cols if c not in bad_cols]
    X_train = X_train[["timestamp"] + regressor_cols]
    X_valid = X_valid[["timestamp"] + regressor_cols]


In [29]:
assert np.isfinite(X_train_scaled[regressor_cols].to_numpy()).all()
assert np.isfinite(X_valid_scaled[regressor_cols].to_numpy()).all()

In [31]:
# Random search over Prophet hyperparams

param_space = {
    "changepoint_prior_scale": lambda: 10 ** np.random.uniform(-2.5, -0.2),   # ~[0.003, 0.63]
    "seasonality_prior_scale": lambda: 10 ** np.random.uniform(-2.0, 1.0),    # ~[0.01, 10]
    "seasonality_mode":        lambda: random.choice(["additive", "multiplicative"]),
    "n_changepoints":          lambda: random.choice([10, 20, 30, 50]),
}

def fit_and_score(params):
    m = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True,
        seasonality_mode=params["seasonality_mode"],
        changepoint_prior_scale=params["changepoint_prior_scale"],
        seasonality_prior_scale=params["seasonality_prior_scale"],
        n_changepoints=params["n_changepoints"],
    )
    for col in regressor_cols:
        m.add_regressor(col)

    m.fit(train_prophet)
    fcst = m.predict(valid_prophet[["ds"] + regressor_cols])

    y_true = valid_prophet["y"].to_numpy().astype(float)
    y_pred = fcst["yhat"].to_numpy().astype(float)

    # Filter out any rows with NaN/Inf (should be none after A), but be defensive
    ok = np.isfinite(y_true) & np.isfinite(y_pred)
    if ok.sum() < len(y_true):
        print(f"[warn] filtered {len(y_true)-ok.sum()} invalid rows in validation (NaN/Inf in y or yhat)")
        y_true = y_true[ok]
        y_pred = y_pred[ok]

    return {
        "rmse": rmse(y_true, y_pred),
        "smape": smape(y_true, y_pred),
        "model": m,
        "params": params,
    }

best = None
for i in range(N_RANDOM_TRIALS):
    trial_params = {k: gen() for k, gen in param_space.items()}
    res = fit_and_score(trial_params)
    if (best is None) or (res["rmse"] < best["rmse"]):
        best = res
    print(f"[{i+1}/{N_RANDOM_TRIALS}] RMSE={res['rmse']:.3f} | sMAPE={res['smape']:.2f}% | {res['params']}")

print("\n=== Best on validation ===")
print(f"RMSE={best['rmse']:.3f} | sMAPE={best['smape']:.2f}%")
print(f"Params: {best['params']}")
print(f"Num regressors used: {len(regressor_cols)}  -> {regressor_cols[:10]}{' ...' if len(regressor_cols)>10 else ''}")


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/qks0e8fg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/bosrkjw2.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=59809', 'data', 'file=/tmp/tmpehrcx7vj/qks0e8fg.json', 'init=/tmp/tmpehrcx7vj/bosrkjw2.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelyf9xqds0/prophet_model-20250822073111.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:31:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:31:47 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[1/10] RMSE=411.129 | sMAPE=11.12% | {'changepoint_prior_scale': 0.006123384461095645, 'seasonality_prior_scale': 0.12836809436161045, 'seasonality_mode': 'additive', 'n_changepoints': 10}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/an509g_v.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/irfaowow.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=46434', 'data', 'file=/tmp/tmpehrcx7vj/an509g_v.json', 'init=/tmp/tmpehrcx7vj/irfaowow.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modeliqrkd0yb/prophet_model-20250822073155.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:31:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:33:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[2/10] RMSE=412.725 | sMAPE=11.08% | {'changepoint_prior_scale': 0.4008571929439228, 'seasonality_prior_scale': 0.02128304757004901, 'seasonality_mode': 'multiplicative', 'n_changepoints': 50}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/iakqsc0t.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/571d3u_r.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=2668', 'data', 'file=/tmp/tmpehrcx7vj/iakqsc0t.json', 'init=/tmp/tmpehrcx7vj/571d3u_r.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelhn5pud_0/prophet_model-20250822073330.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:33:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:34:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[3/10] RMSE=410.946 | sMAPE=11.06% | {'changepoint_prior_scale': 0.1220143362571795, 'seasonality_prior_scale': 0.01188110048160375, 'seasonality_mode': 'additive', 'n_changepoints': 30}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/6tomqpxg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/cjs_x3ph.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80897', 'data', 'file=/tmp/tmpehrcx7vj/6tomqpxg.json', 'init=/tmp/tmpehrcx7vj/cjs_x3ph.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelxb2sjs9b/prophet_model-20250822073427.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:34:27 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:35:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[4/10] RMSE=412.554 | sMAPE=11.07% | {'changepoint_prior_scale': 0.2812092887429654, 'seasonality_prior_scale': 0.18012791934555414, 'seasonality_mode': 'multiplicative', 'n_changepoints': 10}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/fas9ep1m.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/xvj9mzgn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19698', 'data', 'file=/tmp/tmpehrcx7vj/fas9ep1m.json', 'init=/tmp/tmpehrcx7vj/xvj9mzgn.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_model4gx6kque/prophet_model-20250822073531.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:35:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:36:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[5/10] RMSE=411.217 | sMAPE=11.04% | {'changepoint_prior_scale': 0.026692890057013214, 'seasonality_prior_scale': 0.2295469344632916, 'seasonality_mode': 'additive', 'n_changepoints': 20}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/p5qbppd7.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/245j7uq7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45196', 'data', 'file=/tmp/tmpehrcx7vj/p5qbppd7.json', 'init=/tmp/tmpehrcx7vj/245j7uq7.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelnufxr4qe/prophet_model-20250822073630.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:36:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:37:51 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[6/10] RMSE=411.293 | sMAPE=11.03% | {'changepoint_prior_scale': 0.05236242634977407, 'seasonality_prior_scale': 0.3977075370680161, 'seasonality_mode': 'additive', 'n_changepoints': 50}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/vufndqwl.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/me3sbq7e.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=63684', 'data', 'file=/tmp/tmpehrcx7vj/vufndqwl.json', 'init=/tmp/tmpehrcx7vj/me3sbq7e.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelwsofc_ga/prophet_model-20250822073759.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:37:59 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:38:51 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[7/10] RMSE=411.018 | sMAPE=11.06% | {'changepoint_prior_scale': 0.11715660679534537, 'seasonality_prior_scale': 0.012727757002009748, 'seasonality_mode': 'additive', 'n_changepoints': 50}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/cb2y65ab.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/zjzvdsy_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=819', 'data', 'file=/tmp/tmpehrcx7vj/cb2y65ab.json', 'init=/tmp/tmpehrcx7vj/zjzvdsy_.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelf1fyvab1/prophet_model-20250822073900.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:39:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:39:46 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[8/10] RMSE=411.902 | sMAPE=11.20% | {'changepoint_prior_scale': 0.003845873646532195, 'seasonality_prior_scale': 0.10542204131170996, 'seasonality_mode': 'additive', 'n_changepoints': 30}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/l3qezc2b.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/ucgraoap.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=86149', 'data', 'file=/tmp/tmpehrcx7vj/l3qezc2b.json', 'init=/tmp/tmpehrcx7vj/ucgraoap.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelt6k5vy8d/prophet_model-20250822073955.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:39:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:41:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[9/10] RMSE=412.934 | sMAPE=11.07% | {'changepoint_prior_scale': 0.27730502161020787, 'seasonality_prior_scale': 0.025590344027879186, 'seasonality_mode': 'multiplicative', 'n_changepoints': 50}


DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/nk1z0rug.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/rw3u_itg.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38795', 'data', 'file=/tmp/tmpehrcx7vj/nk1z0rug.json', 'init=/tmp/tmpehrcx7vj/rw3u_itg.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelt3ge4gxb/prophet_model-20250822074109.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:41:09 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:42:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


[warn] filtered 1 invalid rows in validation (NaN/Inf in y or yhat)
[10/10] RMSE=411.254 | sMAPE=11.12% | {'changepoint_prior_scale': 0.01023426838881943, 'seasonality_prior_scale': 0.02670917835123553, 'seasonality_mode': 'additive', 'n_changepoints': 50}

=== Best on validation ===
RMSE=410.946 | sMAPE=11.06%
Params: {'changepoint_prior_scale': 0.1220143362571795, 'seasonality_prior_scale': 0.01188110048160375, 'seasonality_mode': 'additive', 'n_changepoints': 30}
Num regressors used: 33  -> ['y_lag_24', 'y_lag_168', 'y_lag_1', 'y_rollmean_3', 'y_rollmean_24', 'cos_hour_lag1', 'y_rollmean_168', 'cos_hour_lag24', 'cos_hour', 'y_rollstd_24'] ...


In [33]:
# refit best model on full data (train+valid)
# Rebuild full frame with same transformations to avoid leakage
full_df = df.copy()

# holiday one-hot: align to train one-hot columns
if "is_holiday" in cat_cols:
    holiday_full = pd.get_dummies(full_df["is_holiday"], prefix="is_holiday")
    holiday_full = holiday_full.reindex(columns=holiday_train_cols, fill_value=0)

# season one-hot: align to train one-hot columns
if "season" in cat_cols:
    season_full = pd.get_dummies(full_df["season"], prefix="season")
    season_full = season_full.reindex(columns=season_train_cols, fill_value=0)

# Weather frequency encoding using train mapping (freq_map)
if "Weather" in cat_cols:
    full_df["Weather_fe"] = full_df["Weather"].map(freq_map).fillna(0.0)
else:
    full_df["Weather_fe"] = np.nan

# Concatenate one-hot encoded categorical features back to full_df
full_df = pd.concat([full_df, season_full, holiday_full], axis=1)

# 2) Build X_full (ds + regs), impute, standardize with TRAIN medians/scaler
X_full = full_df[["timestamp"] + regressor_cols].copy()

# replace inf -> NaN, then impute with train medians; fallback to 0.0
X_full[regressor_cols] = X_full[regressor_cols].replace([np.inf, -np.inf], np.nan)
X_full[regressor_cols] = X_full[regressor_cols].fillna(medians).fillna(0.0)

# standardize using TRAIN scaler
X_full_scaled = X_full.copy()
X_full_scaled[regressor_cols] = scaler.transform(X_full[regressor_cols].values)

# reattach season one-hots if present in regressor_cols
# for col in season_train_cols:
#     if col in regressor_cols:
#         X_full_scaled[col] = season_full[col].values

# reattach is_holiday one-hots if present in regressor_cols
# for col in holiday_train_cols:
#     if col in regressor_cols:
#         X_full_scaled[col] = holiday_full[col].values

# sanity checks
assert np.isfinite(X_full_scaled[regressor_cols].to_numpy()).all(), "Non-finite values remain in regressors."


# Build Prophet training frame for full data
full_prophet = pd.DataFrame({
    "ds": X_full_scaled["timestamp"].values,
    "y": full_df[TARGET_COL].values
})
for c in regressor_cols:
    full_prophet[c] = X_full_scaled[c].values

m_best = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=True,
    seasonality_mode=best["params"]["seasonality_mode"],
    changepoint_prior_scale=best["params"]["changepoint_prior_scale"],
    seasonality_prior_scale=best["params"]["seasonality_prior_scale"],
    n_changepoints=best["params"]["n_changepoints"],
)
for col in regressor_cols:
    m_best.add_regressor(col)

m_best.fit(full_prophet)

print("Refit complete on full dataset.")
print(f"Final regressor count: {len(regressor_cols)}")

DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/q4teuhqt.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpehrcx7vj/y_g594ja.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=64', 'data', 'file=/tmp/tmpehrcx7vj/q4teuhqt.json', 'init=/tmp/tmpehrcx7vj/y_g594ja.json', 'output', 'file=/tmp/tmpehrcx7vj/prophet_modelqq9vgcg2/prophet_model-20250822075725.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
07:57:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
07:58:11 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


Refit complete on full dataset.
Final regressor count: 33


In [35]:
# quick in-sample fit diagnostics
fcst_full = m_best.predict(full_prophet[["ds"] + regressor_cols])

# Remove rows with NaN from full_prophet before calculating metrics
full_prophet_cleaned = full_prophet.dropna(subset=['y'])

y_true = full_prophet_cleaned["y"].to_numpy().astype(float)
# Align fcst_full predictions with the cleaned full_prophet DataFrame
y_pred = fcst_full.loc[full_prophet_cleaned.index, "yhat"].to_numpy().astype(float)


def _rmse(a, b):
    from sklearn.metrics import mean_squared_error
    import math
    return math.sqrt(mean_squared_error(a, b))
def _smape(a, f, eps=1e-8):
    a = np.asarray(a, float); f = np.asarray(f, float)
    denom = (np.abs(a) + np.abs(f) + eps)/2.0
    return np.mean(np.abs(a - f) / denom) * 100.0

print(f"Full in-sample RMSE:  {_rmse(y_true, y_pred):.3f}")
print(f"Full in-sample sMAPE: {_smape(y_true, y_pred):.2f}%")

Full in-sample RMSE:  406.944
Full in-sample sMAPE: 11.14%


In [36]:
try:
    from prophet.serialize import model_to_json
    import json
    with open("prophet_best_full.json", "w") as f:
        json.dump(model_to_json(m_best), f)
    print("Saved model to prophet_best_full.json")
except Exception as e:
    print(f"[warn] Could not serialize model to JSON: {e}")

Saved model to prophet_best_full.json
