# Data setup

In [1]:
import numpy as np, pandas as pd
from prophet import Prophet
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

df0 = pd.read_csv('/content/drive/MyDrive/duong/STLF/Data/merge_weather_energy_hanoi_20202025_cleaned.csv', parse_dates=["timestamp"])
df0

Unnamed: 0,Temperature,Weather,Precipitation,Chance of snow,Humidity,Wind,Wind Gust,Wind Degree,Wind Direction,Cloud Cover,Visibility,timestamp,is_weekend,season,is_holiday,total_consumption_mw
0,25.4,Patchy rain possible,0.6,0.0,89.0,2.194444,4.388889,295.0,WNW,89.0,9.0,2020-01-01 00:00:00,0,winter,False,1790.10
1,25.1,Partly cloudy,0.0,0.0,90.0,2.611111,5.111111,297.0,WNW,34.0,10.0,2020-01-01 01:00:00,0,winter,False,1452.26
2,24.7,Patchy rain possible,0.0,0.0,91.0,2.805556,5.500000,309.0,NW,87.0,10.0,2020-01-01 02:00:00,0,winter,False,1483.75
3,24.5,Cloudy,0.0,0.0,92.0,2.611111,4.888889,325.0,NW,71.0,10.0,2020-01-01 03:00:00,0,winter,False,1890.07
4,24.1,Patchy rain possible,0.0,0.0,93.0,2.305556,4.000000,326.0,NNW,100.0,10.0,2020-01-01 04:00:00,0,winter,False,1371.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47444,26.8,Partly cloudy,0.0,0.0,84.0,1.888889,3.388889,109.0,ESE,56.0,10.0,2025-05-30 20:00:00,0,spring,False,2449.28
47445,26.5,Partly cloudy,0.0,0.0,86.0,2.388889,4.194444,126.0,SE,52.0,10.0,2025-05-30 21:00:00,0,spring,False,2554.05
47446,26.1,Patchy rain possible,0.0,0.0,88.0,2.388889,4.194444,149.0,SSE,84.0,10.0,2025-05-30 22:00:00,0,spring,False,1895.41
47447,25.6,Patchy rain possible,0.5,0.0,91.0,2.305556,4.111111,152.0,SSE,76.0,9.0,2025-05-30 23:00:00,0,spring,False,1558.67


# Feature engineering & Data preprocessing

In [None]:
CAT_COLS   = ['Weather','Wind Direction','season','is_holiday']
EXOG_NUM   = ['Temperature','Precipitation','Humidity',
              'Wind','Wind Gust','Wind Degree','Cloud Cover','Visibility']
HORIZON    = 24                       # forecast t+1…t+24
TARGET_LAGS   = range(1,49)           # lags of target (1…48)
ROLL_WINDOWS  = [3,6,12,24,48]        # rolling windows
EXOG_LAGS     = [0,1,3,6,12,24]       # lags for exogenous vars
TARGET_COL = 'total_consumption_mw'
TIME_COL   = 'timestamp'

In [None]:
df0[TIME_COL] = pd.to_datetime(df0[TIME_COL])
df0 = df0.sort_values(TIME_COL).reset_index(drop=True)

In [None]:
for c in CAT_COLS: df0[c] = df0[c].astype(str) # make sure categoricals are strings

In [None]:
# calendar & cyclical time
df0['hour']      = df0[TIME_COL].dt.hour
df0['dow']       = df0[TIME_COL].dt.dayofweek
df0['dom']       = df0[TIME_COL].dt.day
df0['month']     = df0[TIME_COL].dt.month
df0['doy']       = df0[TIME_COL].dt.dayofyear
df0['hour_sin']  = np.sin(2*np.pi*df0['hour']/24);     df0['hour_cos'] = np.cos(2*np.pi*df0['hour']/24)
df0['dow_sin']   = np.sin(2*np.pi*df0['dow']/7);       df0['dow_cos']  = np.cos(2*np.pi*df0['dow']/7)
df0['doy_sin']   = np.sin(2*np.pi*df0['doy']/365.25);  df0['doy_cos']  = np.cos(2*np.pi*df0['doy']/365.25)


In [None]:
# -----------------------------
# 2) Exogenous lags (known-ahead)
# -----------------------------
df = df0.copy()
for col in EXOG_NUM:
    for L in EXOG_LAGS:
        df[f'{col}_lag{L}'] = df[col].shift(L)

In [None]:
# 3) Target-derived FE (train-time only)
#    We'll compute y_lag_* and y_roll* on the full df for training.
#    At inference we will recompute these recursively.
# -----------------------------
for L in TARGET_LAGS:
    df[f'y_lag_{L}'] = df[TARGET_COL].shift(L)

for W in ROLL_WINDOWS:
    df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
    df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()

  df[f'y_lag_{L}'] = df[TARGET_COL].shift(L)
  df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
  df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()
  df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
  df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()
  df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
  df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()
  df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
  df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()
  df[f'y_rollmean_{W}'] = df[TARGET_COL].shift(1).rolling(W, min_periods=1).mean()
  df[f'y_rollstd_{W}']  = df[TARGET_COL].shift(1).rolling(W, min_periods=1).std()


In [None]:
df

Unnamed: 0,Temperature,Weather,Precipitation,Chance of snow,Humidity,Wind,Wind Gust,Wind Degree,Wind Direction,Cloud Cover,...,y_rollmean_3,y_rollstd_3,y_rollmean_6,y_rollstd_6,y_rollmean_12,y_rollstd_12,y_rollmean_24,y_rollstd_24,y_rollmean_48,y_rollstd_48
0,25.4,Patchy rain possible,0.6,0.0,89.0,2.194444,4.388889,295.0,WNW,89.0,...,,,,,,,,,,
1,25.1,Partly cloudy,0.0,0.0,90.0,2.611111,5.111111,297.0,WNW,34.0,...,1790.100000,,1790.100000,,1790.100000,,1790.100000,,1790.100000,
2,24.7,Patchy rain possible,0.0,0.0,91.0,2.805556,5.500000,309.0,NW,87.0,...,1621.180000,238.888955,1621.180000,238.888955,1621.180000,238.888955,1621.180000,238.888955,1621.180000,238.888955
3,24.5,Cloudy,0.0,0.0,92.0,2.611111,4.888889,325.0,NW,71.0,...,1575.370000,186.626993,1575.370000,186.626993,1575.370000,186.626993,1575.370000,186.626993,1575.370000,186.626993
4,24.1,Patchy rain possible,0.0,0.0,93.0,2.305556,4.000000,326.0,NNW,100.0,...,1608.693333,244.187482,1654.045000,219.040587,1654.045000,219.040587,1654.045000,219.040587,1654.045000,219.040587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47444,26.8,Partly cloudy,0.0,0.0,84.0,1.888889,3.388889,109.0,ESE,56.0,...,2596.690000,234.859275,3268.903333,755.643532,3697.369167,895.278483,2733.392500,1221.912979,2733.948542,1173.487521
47445,26.5,Partly cloudy,0.0,0.0,86.0,2.388889,4.194444,126.0,SE,52.0,...,2471.866667,129.901172,2998.636667,699.333561,3667.770833,932.576403,2720.084583,1223.250579,2728.695417,1174.200347
47446,26.1,Patchy rain possible,0.0,0.0,88.0,2.388889,4.194444,149.0,SSE,84.0,...,2538.303333,82.287852,2767.610000,535.946673,3610.810000,980.865190,2721.832083,1222.970480,2727.537500,1174.347655
47447,25.6,Patchy rain possible,0.5,0.0,91.0,2.305556,4.111111,152.0,SSE,76.0,...,2299.580000,353.919807,2448.135000,314.085258,3382.946667,1038.544393,2717.724583,1225.683851,2723.365208,1176.993105


In [None]:
# 4) One-hot categoricals
# -----------------------------
df = pd.get_dummies(df, columns=CAT_COLS, drop_first=False)

In [None]:
# rename for Prophet
df = df.rename(columns={TIME_COL:'ds', TARGET_COL:'y'})

# drop NaNs due to lagging
df = df.dropna().reset_index(drop=True)

In [None]:
# ensure unique hourly stamps (keeps the latest if duplicates slipped in)
df = df.sort_values('ds').drop_duplicates(subset='ds', keep='last').reset_index(drop=True)

# Model Training

### 70-15-15 split

In [None]:
train_size = int(0.7 * len(df))
val_size   = int(0.15 * len(df))

train_df = df.iloc[:train_size].copy()
val_df   = df.iloc[train_size:train_size+val_size].copy()
test_df  = df.iloc[train_size+val_size:].copy()

In [None]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(33180, 157)
(7110, 157)
(7111, 157)


In [None]:
# 6) Scale regressors on train-only
# -----------------------------
all_cols   = df.columns.tolist()
reg_cols   = [c for c in all_cols if c not in ['ds','y']]
targ_lag_cols = [f'y_lag_{L}' for L in TARGET_LAGS]
targ_roll_cols= [f'y_rollmean_{W}' for W in ROLL_WINDOWS] + [f'y_rollstd_{W}' for W in ROLL_WINDOWS]

In [None]:
scaler = MinMaxScaler()
train_df[reg_cols] = scaler.fit_transform(train_df[reg_cols])
val_df[reg_cols]   = scaler.transform(val_df[reg_cols])
test_df[reg_cols]  = scaler.transform(test_df[reg_cols])

In [None]:
# Keep an unscaled base frame to fetch raw rows for future timestamps
base_unscaled = df.copy()
# (IMPORTANT) Put back unscaled values for regressors so we can transform later consistently
base_unscaled[reg_cols] = df[reg_cols].copy()  # already unscaled

In [None]:
m = Prophet(daily_seasonality=True, weekly_seasonality=True, yearly_seasonality=True)
for c in reg_cols:
    m.add_regressor(c)
m.fit(train_df[['ds','y'] + reg_cols])

DEBUG:cmdstanpy:input tempfile: /tmp/tmpkbuunpsx/m47ejc8w.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpkbuunpsx/i4arylff.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6003', 'data', 'file=/tmp/tmpkbuunpsx/m47ejc8w.json', 'init=/tmp/tmpkbuunpsx/i4arylff.json', 'output', 'file=/tmp/tmpkbuunpsx/prophet_modelxqzjombb/prophet_model-20250924102017.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
10:20:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
10:21:29 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f94d911e720>

In [None]:
# 8) Recursive day-ahead prediction
#    For each daily anchor t0 in the eval window:
#    - step h=1..24:
#        * build one-row future with exog/calendar/one-hots for ts
#        * compute y_lag_* and y_roll* from history that includes
#          true past up to t0 plus previous predictions
#        * scale with train-fitted scaler
#        * predict yhat; append to history
# -----------------------------
idx_all_y = df.set_index('ds')['y']  # true y (same units as target)

# columns that are NOT target-derived (we copy these from base_unscaled for each ts)
non_target_reg_cols = [c for c in reg_cols if c not in (
    [f'y_lag_{L}' for L in TARGET_LAGS] +
    [f'y_rollmean_{W}' for W in ROLL_WINDOWS] +
    [f'y_rollstd_{W}'  for W in ROLL_WINDOWS]
)]

# base frame to fetch exog/calendar/one-hots at future stamps (unscaled)
base_unscaled = df.copy()

def compute_target_feats_from_history(y_hist: pd.Series):
    feats = {}
    # lags
    for L in TARGET_LAGS:
        feats[f'y_lag_{L}'] = float(y_hist.iloc[-L]) if len(y_hist) >= L else float(y_hist.iloc[0])
    # rollings
    for W in ROLL_WINDOWS:
        tail = y_hist.tail(W).values
        feats[f'y_rollmean_{W}'] = float(np.mean(tail))
        feats[f'y_rollstd_{W}']  = float(np.std(tail, ddof=0)) if len(tail) > 1 else 0.0
    return feats

def one_row_for_ts(ts):
    """Return a single unscaled row (ds + non_target_reg_cols) for this timestamp."""
    rb = base_unscaled.loc[base_unscaled['ds'] == ts, ['ds'] + non_target_reg_cols]
    if rb.empty:
        return None
    # If duplicates remain for some reason, take the last one
    rb = rb.tail(1).copy()
    # keep it strictly 1 row
    return rb.reset_index(drop=True)

def predict_recursive_24(m, t0):
    y_hist = idx_all_y.loc[:t0].copy()
    rows = []
    for h in range(1, HORIZON+1):
        ts = t0 + pd.Timedelta(hours=h)
        base_row = one_row_for_ts(ts)
        if base_row is None:
            break  # outside data range

        # start a 1-row frame with all regressors present (zeros template)
        row_full = pd.DataFrame([{c: 0.0 for c in reg_cols}])
        row_full.insert(0, 'ds', base_row['ds'].iloc[0])

        # fill non-target regressors from base_row
        for c in non_target_reg_cols:
            row_full.at[0, c] = float(base_row[c].iloc[0])

        # fill target-derived features from updated history
        tfeats = compute_target_feats_from_history(y_hist)
        for k, v in tfeats.items():
            if k in row_full.columns:
                row_full.at[0, k] = v

        # scale regressors with train-fitted scaler (Prophet ignores y column here)
        row_scaled = row_full.copy()
        row_scaled[reg_cols] = scaler.transform(row_full[reg_cols])

        # 1-step forecast
        fcst = m.predict(row_scaled[['ds'] + reg_cols])
        yhat = float(fcst['yhat'].iloc[0])

        # update history with prediction for next step's lags/rollings
        y_hist = pd.concat([y_hist, pd.Series([yhat], index=[ts])])

        # stash result (and true y if available)
        y_true = float(idx_all_y.loc[ts]) if ts in idx_all_y.index else np.nan
        rows.append({'ds': ts, 'h': h, 'y_true': y_true, 'y_pred': yhat})
    return pd.DataFrame(rows)

# roll daily anchors across TEST window
records = []
ds_min = test_df['ds'].min().replace(minute=0, second=0, microsecond=0)
ds_max = test_df['ds'].max()

t0 = ds_min
while t0 + pd.Timedelta(hours=HORIZON) <= ds_max:
    out = predict_recursive_24(m, t0)
    out['anchor'] = t0
    records.append(out)
    t0 += pd.Timedelta(days=1)

res_long = pd.concat(records, ignore_index=True)
res_long = res_long.dropna(subset=['y_true'])  # keep only rows we can score


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_tre

In [None]:
# Example: evaluate on the last-8-weeks TEST window
overall_da, per_h_da, raw_da = evaluate_dayahead_24h(m, test[["ds","y"] + reg_cols], reg_cols)
print("Day-ahead (next 24h) OVERALL on TEST:\n", overall_da)
print("\nPer-horizon RMSE/MAPE (h = 1..24):")
print(per_h_da[["h","RMSE","MAPE_%"]].to_string(index=False))


  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = pd.date_range(t0, periods=24, freq="H")
  horizon = 

Day-ahead (next 24h) OVERALL on TEST:
 MAE       610.262598
RMSE      813.626625
MAPE_%     29.294539
dtype: float64

Per-horizon RMSE/MAPE (h = 1..24):
 h        RMSE    MAPE_%
 1  784.554271 26.170781
 2  875.375362 24.515444
 3  963.050162 24.436898
 4  891.946847 25.440653
 5  762.751935 23.783246
 6  522.883190 19.939593
 7  423.549514 12.812361
 8  568.456030 20.384806
 9  702.346968 21.506329
10  633.917754 18.085320
11  583.765801 15.992622
12  610.880926 20.804362
13  542.146570 22.253275
14  633.256999 30.776561
15  562.322316 30.509289
16  734.504589 43.757391
17  526.269634 31.948225
18  623.739929 34.871160
19  480.275120 24.092099
20  422.480532 18.670384
21  471.292431 20.154428
22 1318.849535 60.231867
23 1891.858041 79.065001
24 1284.378603 52.866846


  horizon = pd.date_range(t0, periods=24, freq="H")
  .apply(lambda g: metrics_df(g["y_true"], g["y_pred"]))


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
def agg(y, yhat):
    mae  = mean_absolute_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    mape = np.mean(np.abs((y - yhat)/np.clip(np.abs(y), 1e-9, None))) * 100
    return mae, rmse, mape

overall = agg(res_long['y_true'], res_long['y_pred'])
per_h = (res_long.groupby('h')
         .apply(lambda g: pd.Series(agg(g['y_true'], g['y_pred']),
                                    index=['MAE','RMSE','MAPE_%']))
         .reset_index())

print("Day-ahead (recursive) OVERALL on TEST:")
print("  MAE  = {:.3f}\n  RMSE = {:.3f}\n  MAPE = {:.3f}%".format(*overall))
print("\nPer-horizon RMSE/MAPE (h=1..24):")
print(per_h[['h','RMSE','MAPE_%']].to_string(index=False))


Day-ahead (recursive) OVERALL on TEST:
  MAE  = 499.323
  RMSE = 656.673
  MAPE = 24.773%

Per-horizon RMSE/MAPE (h=1..24):
 h       RMSE    MAPE_%
 1 377.380759  9.585767
 2 549.018082 14.418439
 3 578.660982 16.219527
 4 418.426254 13.093402
 5 197.469070  6.665461
 6 318.782978 15.280399
 7 476.691558 27.359595
 8 602.976413 34.697677
 9 760.677773 44.009489
10 640.923357 37.015542
11 908.184842 48.543256
12 818.085776 38.850547
13 718.970932 30.027014
14 491.413335 21.163440
15 384.348768 11.287366
16 460.007423 13.615357
17 557.775806 16.028429
18 806.477716 22.438960
19 864.464251 25.525474
20 831.866351 28.491323
21 768.699445 28.434163
22 870.711259 35.709187
23 846.253660 30.367052
24 772.564092 25.735330


  .apply(lambda g: pd.Series(agg(g['y_true'], g['y_pred']),


### last 8 weeks as test set split

In [None]:
N         = df.shape[0]
train_end = int(N - 2*8*7*24)
valid_end = int(N - 8*7*24)

train_last8w = df.iloc[:train_end].copy()
val_last8w = df.iloc[train_end:valid_end].copy()
test_last8w = df.iloc[valid_end:].copy()

print(train_last8w.shape)
print(val_last8w.shape)
print(test_last8w.shape)

(44713, 157)
(1344, 157)
(1344, 157)


In [None]:
# 6) Scale regressors on train-only
# -----------------------------
all_cols   = df.columns.tolist()
reg_cols   = [c for c in all_cols if c not in ['ds','y']]
targ_lag_cols = [f'y_lag_{L}' for L in TARGET_LAGS]
targ_roll_cols= [f'y_rollmean_{W}' for W in ROLL_WINDOWS] + [f'y_rollstd_{W}' for W in ROLL_WINDOWS]

scaler = MinMaxScaler()
train_last8w[reg_cols] = scaler.fit_transform(train_last8w[reg_cols])
val_last8w[reg_cols]   = scaler.transform(val_last8w[reg_cols])
test_last8w[reg_cols]  = scaler.transform(test_last8w[reg_cols])

# Keep an unscaled base frame to fetch raw rows for future timestamps
base_unscaled = df.copy()
# (IMPORTANT) Put back unscaled values for regressors so we can transform later consistently
base_unscaled[reg_cols] = df[reg_cols].copy()  # already unscaled

In [None]:
m = Prophet(daily_seasonality=True, weekly_seasonality=True, yearly_seasonality=True)
for c in reg_cols:
    m.add_regressor(c)
m.fit(train_last8w[['ds','y'] + reg_cols])

DEBUG:cmdstanpy:input tempfile: /tmp/tmpkbuunpsx/foi2l9my.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpkbuunpsx/d_7sgy7y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48641', 'data', 'file=/tmp/tmpkbuunpsx/foi2l9my.json', 'init=/tmp/tmpkbuunpsx/d_7sgy7y.json', 'output', 'file=/tmp/tmpkbuunpsx/prophet_modelrurp9zyg/prophet_model-20250924112428.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:24:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:25:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f94bfa0ef90>

In [None]:
# 8) Recursive day-ahead prediction
#    For each daily anchor t0 in the eval window:
#    - step h=1..24:
#        * build one-row future with exog/calendar/one-hots for ts
#        * compute y_lag_* and y_roll* from history that includes
#          true past up to t0 plus previous predictions
#        * scale with train-fitted scaler
#        * predict yhat; append to history
# -----------------------------
idx_all_y = df.set_index('ds')['y']  # true y (same units as target)

# columns that are NOT target-derived (we copy these from base_unscaled for each ts)
non_target_reg_cols = [c for c in reg_cols if c not in (
    [f'y_lag_{L}' for L in TARGET_LAGS] +
    [f'y_rollmean_{W}' for W in ROLL_WINDOWS] +
    [f'y_rollstd_{W}'  for W in ROLL_WINDOWS]
)]

# base frame to fetch exog/calendar/one-hots at future stamps (unscaled)
base_unscaled = df.copy()

def compute_target_feats_from_history(y_hist: pd.Series):
    feats = {}
    # lags
    for L in TARGET_LAGS:
        feats[f'y_lag_{L}'] = float(y_hist.iloc[-L]) if len(y_hist) >= L else float(y_hist.iloc[0])
    # rollings
    for W in ROLL_WINDOWS:
        tail = y_hist.tail(W).values
        feats[f'y_rollmean_{W}'] = float(np.mean(tail))
        feats[f'y_rollstd_{W}']  = float(np.std(tail, ddof=0)) if len(tail) > 1 else 0.0
    return feats

def one_row_for_ts(ts):
    """Return a single unscaled row (ds + non_target_reg_cols) for this timestamp."""
    rb = base_unscaled.loc[base_unscaled['ds'] == ts, ['ds'] + non_target_reg_cols]
    if rb.empty:
        return None
    # If duplicates remain for some reason, take the last one
    rb = rb.tail(1).copy()
    # keep it strictly 1 row
    return rb.reset_index(drop=True)

def predict_recursive_24(m, t0):
    y_hist = idx_all_y.loc[:t0].copy()
    rows = []
    for h in range(1, HORIZON+1):
        ts = t0 + pd.Timedelta(hours=h)
        base_row = one_row_for_ts(ts)
        if base_row is None:
            break  # outside data range

        # start a 1-row frame with all regressors present (zeros template)
        row_full = pd.DataFrame([{c: 0.0 for c in reg_cols}])
        row_full.insert(0, 'ds', base_row['ds'].iloc[0])

        # fill non-target regressors from base_row
        for c in non_target_reg_cols:
            row_full.at[0, c] = float(base_row[c].iloc[0])

        # fill target-derived features from updated history
        tfeats = compute_target_feats_from_history(y_hist)
        for k, v in tfeats.items():
            if k in row_full.columns:
                row_full.at[0, k] = v

        # scale regressors with train-fitted scaler (Prophet ignores y column here)
        row_scaled = row_full.copy()
        row_scaled[reg_cols] = scaler.transform(row_full[reg_cols])

        # 1-step forecast
        fcst = m.predict(row_scaled[['ds'] + reg_cols])
        yhat = float(fcst['yhat'].iloc[0])

        # update history with prediction for next step's lags/rollings
        y_hist = pd.concat([y_hist, pd.Series([yhat], index=[ts])])

        # stash result (and true y if available)
        y_true = float(idx_all_y.loc[ts]) if ts in idx_all_y.index else np.nan
        rows.append({'ds': ts, 'h': h, 'y_true': y_true, 'y_pred': yhat})
    return pd.DataFrame(rows)

# roll daily anchors across TEST window
records = []
ds_min = test_df['ds'].min().replace(minute=0, second=0, microsecond=0)
ds_max = test_df['ds'].max()

t0 = ds_min
while t0 + pd.Timedelta(hours=HORIZON) <= ds_max:
    out = predict_recursive_24(m, t0)
    out['anchor'] = t0
    records.append(out)
    t0 += pd.Timedelta(days=1)

res_long = pd.concat(records, ignore_index=True)
res_long = res_long.dropna(subset=['y_true'])  # keep only rows we can score

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_trend(df)
  df['trend'] = self.predict_tre

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
def agg(y, yhat):
    mae  = mean_absolute_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    mape = np.mean(np.abs((y - yhat)/np.clip(np.abs(y), 1e-9, None))) * 100
    return mae, rmse, mape

overall = agg(res_long['y_true'], res_long['y_pred'])
per_h = (res_long.groupby('h')
         .apply(lambda g: pd.Series(agg(g['y_true'], g['y_pred']),
                                    index=['MAE','RMSE','MAPE_%']))
         .reset_index())

print("Day-ahead (recursive) OVERALL on TEST:")
print("  MAE  = {:.3f}\n  RMSE = {:.3f}\n  MAPE = {:.3f}%".format(*overall))
print("\nPer-horizon RMSE/MAPE (h=1..24):")
print(per_h[['h','RMSE','MAPE_%']].to_string(index=False))


Day-ahead (recursive) OVERALL on TEST:
  MAE  = 239.806
  RMSE = 336.689
  MAPE = 10.381%

Per-horizon RMSE/MAPE (h=1..24):
 h       RMSE    MAPE_%
 1 309.930734  6.869528
 2 332.516046  7.872801
 3 289.437625  7.230710
 4 288.427710  7.840505
 5 195.901892  6.598135
 6 268.846794 10.427944
 7 216.622515  9.926959
 8 194.668289  8.326748
 9 177.409486  8.410351
10 270.253561 12.278397
11 307.770142 13.722080
12 296.818533 11.960567
13 319.467302 12.206831
14 330.880578 10.840300
15 383.187524 11.004703
16 430.543586 11.968144
17 423.021523 11.059437
18 491.396073 11.680786
19 444.214332 11.046147
20 411.018113 11.551755
21 392.727709 12.218562
22 384.772517 13.388602
23 331.444190 10.483954
24 348.800433 10.225362


  .apply(lambda g: pd.Series(agg(g['y_true'], g['y_pred']),


In [None]:
import joblib

# save
joblib.dump(m, "prophet_model.pkl")

# load
m_loaded = joblib.load("prophet_model.pkl")
