In [1]:
from src.model_predict import forecast_future
from src.preprocessing import load_data, split_data, split_processed_data
from src.forecasting import RollingForecaster
from src.feature_engineering import create_features
from src.model_fit import train_models

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the round-3 input data. As the next cell's output shows, the result is a
# dict mapping a control-point name to a DataFrame with `Date` and `Total` columns.
# NOTE(review): presumably one workbook per control point matches the pattern — confirm in load_data.
data_dict = load_data('data/round_3/*.xlsx')

In [3]:
# Inspect the loaded data: one (Date, Total) DataFrame per control point.
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 819 2025-04-06  78654
 820 2025-04-07  68436
 821 2025-04-08  60797
 822 2025-04-09  54164
 823 2025-04-10  55832
 
 [824 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 819 2025-04-06  85363
 820 2025-04-07  33758
 821 2025-04-08  28809
 822 2025-04-09  28309
 823 2025-04-10  28153
 
 [824 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 790 2025-04-06  137628
 791 2025-04-07   92555
 792 2025-04-08   82638
 793 2025-04-09   81359
 794 2025-04-10   80281
 
 [795 rows x 2 columns],
 '

In [4]:
# Build the feature table for every control point, keyed by the same names as data_dict.
processed_data_dict = {
    name: create_features(data)
    for name, data in data_dict.items()
}

In [5]:
# Inspect the engineered features (calendar fields, holiday/weekend flags,
# lag_* columns and rolling_7_* statistics are visible in the output below).
processed_data_dict

{'Hong Kong International Airport':           Date  Total  year  month  day  weekday  quarter  is_weekend  \
 60  2023-03-09  31792  2023      3    9        3        1           0   
 61  2023-03-10  33559  2023      3   10        4        1           0   
 62  2023-03-11  33421  2023      3   11        5        1           1   
 63  2023-03-12  37163  2023      3   12        6        1           1   
 64  2023-03-13  35531  2023      3   13        0        1           0   
 ..         ...    ...   ...    ...  ...      ...      ...         ...   
 819 2025-04-06  78654  2025      4    6        6        2           1   
 820 2025-04-07  68436  2025      4    7        0        2           0   
 821 2025-04-08  60797  2025      4    8        1        2           0   
 822 2025-04-09  54164  2025      4    9        2        2           0   
 823 2025-04-10  55832  2025      4   10        3        2           0   
 
      is_holiday    lag_1  ...   lag_60  rolling_7_mean  rolling_7_std  \
 

In [6]:
# Re-display the raw data after feature engineering.
# NOTE(review): presumably a check that create_features did not modify the inputs — confirm intent.
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 819 2025-04-06  78654
 820 2025-04-07  68436
 821 2025-04-08  60797
 822 2025-04-09  54164
 823 2025-04-10  55832
 
 [824 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 819 2025-04-06  85363
 820 2025-04-07  33758
 821 2025-04-08  28809
 822 2025-04-09  28309
 823 2025-04-10  28153
 
 [824 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 790 2025-04-06  137628
 791 2025-04-07   92555
 792 2025-04-08   82638
 793 2025-04-09   81359
 794 2025-04-10   80281
 
 [795 rows x 2 columns],
 '

In [7]:
import joblib
import os
# Make sure the output directory for the saved models exists
# (exist_ok=True: no error if it is already there).
os.makedirs('models/round3', exist_ok=True)

In [8]:
for name, processed_data in processed_data_dict.items():
    # Train on this control point's split. train_models hands back the base
    # models (indexable by model name), the stacking model, and a dict of MAPE
    # scores keyed by model name (see the printed output).
    X_train, X_test, y_train, y_test = split_processed_data(processed_data)
    best_models, stacking, mape = train_models(X_train, y_train, X_test, y_test)
    print(f'{name}:{mape}')

    # The winner is the key with the smallest MAPE; 'stacking' is not stored in
    # best_models, so it is resolved to the stacking object directly.
    best_model_name = min(mape, key=mape.get)
    best_model = stacking if best_model_name == 'stacking' else best_models[best_model_name]

    # Persist the winner under a file name that carries the control-point name.
    filename = f"models/round3/{name}_best_{best_model_name}.pkl"
    joblib.dump(best_model, filename)

    # The stacking model is always saved as well, whether or not it won.
    stacking_filename = f"models/round3/{name}_stacking.pkl"
    joblib.dump(stacking, stacking_filename)

    print(f"{name} 最佳模型已保存：{filename} (MAPE={mape[best_model_name]:.4f})")

Hong Kong International Airport:{'lgbm': 0.04872523448973291, 'catboost': 0.04840218734105161, 'random_forest': 0.051252944960489676, 'stacking': 0.05009125530551329}
Hong Kong International Airport 最佳模型已保存：models/round3/Hong Kong International Airport_best_catboost.pkl (MAPE=0.0484)
Hong Kong-Zhuhai-Macao Bridge:{'lgbm': 0.08622027187946481, 'catboost': 0.08489536268322323, 'random_forest': 0.08225237828776302, 'stacking': 0.0817905837614311}
Hong Kong-Zhuhai-Macao Bridge 最佳模型已保存：models/round3/Hong Kong-Zhuhai-Macao Bridge_best_stacking.pkl (MAPE=0.0818)
Lo Wu:{'lgbm': 0.07077361512146765, 'catboost': 0.0669536293858709, 'random_forest': 0.07372636419613095, 'stacking': 0.0689359610138944}
Lo Wu 最佳模型已保存：models/round3/Lo Wu_best_catboost.pkl (MAPE=0.0670)
Lok Ma Chau Spur Line:{'lgbm': 0.0679409043205806, 'catboost': 0.07074008624435238, 'random_forest': 0.06713955176760707, 'stacking': 0.07936022417137974}
Lok Ma Chau Spur Line 最佳模型已保存：models/round3/Lok Ma Chau Spur Line_best_random_f

In [13]:
# Best-model files written by the training cell. Every file name has the form
# "<control-point name>_best_<model name>.pkl"; the prefix is used below to find
# the matching feature table.
models_path = [
    'models/round3/Hong Kong International Airport_best_catboost.pkl',
    'models/round3/Hong Kong-Zhuhai-Macao Bridge_best_stacking.pkl',
    'models/round3/Lo Wu_best_catboost.pkl',
    'models/round3/Lok Ma Chau Spur Line_best_random_forest.pkl',
    'models/round3/Shenzhen Bay_best_catboost.pkl',
]
for model_path in models_path:
    # Pair model and data by name, not by position. zip() over the dict and the
    # list would silently use the wrong model if the dict order ever differed
    # from this list, and would silently drop entries if the lengths differed.
    name = os.path.basename(model_path).rsplit('_best_', 1)[0]
    if name not in processed_data_dict:
        raise KeyError(f"No processed data found for model file: {model_path}")
    processed_data = processed_data_dict[name]

    model = joblib.load(model_path)
    # 5-step-ahead forecast; only the predicted `Total` series is shown.
    print(forecast_future(processed_data, model, 5)['Total'])

Date
2025-04-11    58604.086514
2025-04-12    59084.798842
2025-04-13    65497.235105
2025-04-14    60509.029150
2025-04-15    53631.722966
Name: Total, dtype: float64
Date
2025-04-11    31108.982878
2025-04-12    33492.078068
2025-04-13    54141.532420
2025-04-14    33948.528711
2025-04-15    27651.562428
Name: Total, dtype: float64
Date
2025-04-11     80556.080883
2025-04-12    100483.578965
2025-04-13    122053.032521
2025-04-14     86988.822078
2025-04-15     77943.044750
Name: Total, dtype: float64
Date
2025-04-11     67028.626455
2025-04-12     92394.203973
2025-04-13    107455.128736
2025-04-14     70678.876800
2025-04-15     65724.074407
Name: Total, dtype: float64
Date
2025-04-11    53712.662825
2025-04-12    68172.510619
2025-04-13    82515.560143
2025-04-14    46829.530983
2025-04-15    43532.439247
Name: Total, dtype: float64


In [10]:
from statsforecast import StatsForecast
import numpy as np
import pandas as pd

In [12]:
# For every control point: validate the statistical models with RollingForecaster,
# pick the model with the lowest average score, then forecast `total_horizon` days
# in chunks of at most `current_step` days, appending each chunk of predictions to
# the history before forecasting the next chunk.
for name, data in data_dict.items():
    train, test = split_data(data)
    forecaster = RollingForecaster(train, test, steps=[3])
    # NOTE(review): `predictions` is not used anywhere in this cell.
    metrics, predictions, ensemble_model = forecaster.run()
    # Pick the model with the smallest MAPE as the best model
    # (column-wise mean of `metrics`, then the label of the minimum).
    avg_metrics = metrics.mean()
    print(metrics)
    forecaster.best_model_name = avg_metrics.idxmin()

    # Forecasting parameters
    total_horizon = 5  # total number of days to forecast
    current_step = 3   # step size used during validation (matches steps=[3] above)

    if forecaster.best_model_name == 'Ensemble':
        forecaster.best_model = ensemble_model
        model_preds = []
        # Work on a copy and add the constant `unique_id` column passed to StatsForecast.
        data_with_id = data.copy()
        data_with_id.insert(0, 'unique_id', 1)

        # Rolling forecast, one chunk at a time
        remaining = total_horizon
        while remaining > 0:
            h = min(current_step, remaining)
            # Each base model forecasts the current chunk
            step_preds = []
            for model in forecaster.models:
                # A new StatsForecast wrapper is built per model per chunk.
                fcst = StatsForecast(models=[model], freq='D')
                pred = fcst.forecast(df=data_with_id, h=h,
                                   time_col='Date', target_col='Total')
                # The forecast column is looked up by the model's class name.
                step_preds.append(pred[model.__class__.__name__].values)

            # Build the ensemble prediction: one column per base model, in
            # forecaster.models order, fed to the ensemble model's predict().
            X_stack = np.column_stack(step_preds)
            final_pred_step = forecaster.best_model.predict(X_stack)[:h]  # keep only the h values needed

            # Store the predictions and extend the history
            model_preds.extend(final_pred_step)
            # Dates for the newly predicted days
            last_date = data_with_id['Date'].max()
            new_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=h)
            # New rows that serve as history for the next chunk
            new_data = pd.DataFrame({
                'Date': new_dates,
                'unique_id': 1,
                'Total': final_pred_step
            })
            # NOTE(review): concat without ignore_index keeps new_data's own 0..h-1
            # index labels, so the combined frame has duplicate index labels.
            data_with_id = pd.concat([data_with_id, new_data])
            remaining -= h

        final_pred = np.array(model_preds)[:total_horizon]  # ensure exactly total_horizon values
    else:
        forecaster.best_model = forecaster.models[forecaster.model_names.index(
            forecaster.best_model_name)]
        # Rolling forecast with a single base model
        # NOTE(review): in the recorded output visible here 'Ensemble' always had the
        # lowest score, so this branch may never have run; confirm that the objects in
        # forecaster.models accept predict(df=..., h=..., time_col=..., target_col=...).
        remaining = total_horizon
        final_pred = []
        data_for_pred = data.copy()

        while remaining > 0:
            h = min(current_step, remaining)
            pred = forecaster.best_model.predict(df=data_for_pred, h=h,
                                               time_col='Date', target_col='Total')
            final_pred.extend(pred)
            # Extend the history for the next chunk
            last_date = data_for_pred['Date'].max()
            new_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=h)
            new_data = pd.DataFrame({
                'Date': new_dates,
                'Total': pred
            })
            data_for_pred = pd.concat([data_for_pred, new_data])
            remaining -= h

        final_pred = np.array(final_pred)[:total_horizon]

    print(f'{name}: {final_pred}')

        ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
3  0.053487  0.064292   0.055792               0.056288   0.02567
Hong Kong International Airport: [61638.44218651 61835.19225108 68510.40499603 64067.42073103
 56879.65802165]
        ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
3  0.170194  0.124443   0.109409               0.132245  0.056057
Hong Kong-Zhuhai-Macao Bridge: [28274.63590493 38391.00648204 60018.98146825 36142.28677742
 28766.24719351]
        ETS   SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
3  0.101758  0.08125   0.082066               0.096047  0.046044
Lo Wu: [ 84782.82959361  99623.55961652 131176.70409199  86428.98575423
  79106.00937036]
        ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
3  0.093009  0.081959   0.077441               0.091957  0.035724
Lok Ma Chau Spur Line: [ 68875.61301154  96722.49533333 108914.34728571  69517.15913138
  68713.49384124]
        ETS   SARIMA  AutoTBATS  DynamicOptimizedTheta  En