In [1]:
from src.model_predict import forecast_future
from src.preprocessing import load_data, split_data, split_processed_data
from src.forecasting import RollingForecaster
from src.feature_engineering import create_features
from src.model_fit import train_models

import warnings
# Silence every warning category for the rest of the session (no filter on
# category or module is given), so deprecation/convergence warnings raised by
# the modelling libraries will not appear in any cell output below.
warnings.filterwarnings("ignore")

In [2]:
# Load the round-2 Excel files matched by the glob pattern. As the displayed
# output shows, the result is a dict keyed by control-point name whose values
# are DataFrames with `Date` and `Total` columns.
# NOTE(review): how file names map to dict keys is decided inside load_data — confirm there.
data_dict = load_data('data/round_2/*.xlsx')

In [3]:
# Inspect the raw frames (one per control point; daily `Date` / `Total` rows).
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 818 2025-04-05  57979
 819 2025-04-06  78654
 820 2025-04-07  68436
 821 2025-04-08  60797
 822 2025-04-09  54164
 
 [823 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 818 2025-04-05  57120
 819 2025-04-06  85363
 820 2025-04-07  33758
 821 2025-04-08  28809
 822 2025-04-09  28309
 
 [823 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 789 2025-04-05  114026
 790 2025-04-06  137628
 791 2025-04-07   92555
 792 2025-04-08   82638
 793 2025-04-09   81359
 
 [794 rows x 2 columns],
 '

In [4]:
# Run feature engineering on every raw frame, keeping the same keys as
# data_dict so raw and processed frames can be matched by control-point name.
processed_data_dict = {
    name: create_features(data)
    for name, data in data_dict.items()
}

In [5]:
# Inspect the engineered frames: the output shows calendar fields
# (year/month/day/weekday/quarter, weekend/holiday flags) plus lag_* and
# rolling_7_* columns, with the first displayed row at index 60.
processed_data_dict

{'Hong Kong International Airport':           Date  Total  year  month  day  weekday  quarter  is_weekend  \
 60  2023-03-09  31792  2023      3    9        3        1           0   
 61  2023-03-10  33559  2023      3   10        4        1           0   
 62  2023-03-11  33421  2023      3   11        5        1           1   
 63  2023-03-12  37163  2023      3   12        6        1           1   
 64  2023-03-13  35531  2023      3   13        0        1           0   
 ..         ...    ...   ...    ...  ...      ...      ...         ...   
 818 2025-04-05  57979  2025      4    5        5        2           1   
 819 2025-04-06  78654  2025      4    6        6        2           1   
 820 2025-04-07  68436  2025      4    7        0        2           0   
 821 2025-04-08  60797  2025      4    8        1        2           0   
 822 2025-04-09  54164  2025      4    9        2        2           0   
 
      is_holiday    lag_1  ...   lag_60  rolling_7_mean  rolling_7_std  \
 

In [6]:
# Display the raw frames again after feature engineering.
# NOTE(review): presumably a check that create_features did not modify the
# inputs in place — confirm, or drop this duplicate display.
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 818 2025-04-05  57979
 819 2025-04-06  78654
 820 2025-04-07  68436
 821 2025-04-08  60797
 822 2025-04-09  54164
 
 [823 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 818 2025-04-05  57120
 819 2025-04-06  85363
 820 2025-04-07  33758
 821 2025-04-08  28809
 822 2025-04-09  28309
 
 [823 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 789 2025-04-05  114026
 790 2025-04-06  137628
 791 2025-04-07   92555
 792 2025-04-08   82638
 793 2025-04-09   81359
 
 [794 rows x 2 columns],
 '

In [7]:
import joblib
import os
# Make sure the output directory for saved models exists; exist_ok=True makes
# this cell safe to re-run.
os.makedirs('models/round2', exist_ok=True)

In [8]:
# Train all candidate models per control point, report their MAPE scores, and
# persist both the best-scoring model and the stacking model with joblib.
for name, processed_data in processed_data_dict.items():
    X_train, X_test, y_train, y_test = split_processed_data(processed_data)
    best_models, stacking, mape = train_models(X_train, y_train, X_test, y_test)
    print(f'{name}:{mape}')

    # The winner is the entry with the smallest MAPE (first entry wins a tie).
    best_model_name = min(mape.items(), key=lambda item: item[1])[0]

    # The stacking model is returned separately from the base-model mapping,
    # so it cannot be looked up in best_models.
    best_model = stacking if best_model_name == 'stacking' else best_models[best_model_name]

    # Save the winner under a file name that carries the control-point name.
    filename = f"models/round2/{name}_best_{best_model_name}.pkl"
    joblib.dump(best_model, filename)

    # Additionally save the stacking model on its own, whatever the winner was.
    stacking_filename = f"models/round2/{name}_stacking.pkl"
    joblib.dump(stacking, stacking_filename)

    print(f"{name} 最佳模型已保存：{filename} (MAPE={mape[best_model_name]:.4f})")

Hong Kong International Airport:{'lgbm': 0.05088193618520497, 'catboost': 0.047972455712815304, 'random_forest': 0.04730613332369717, 'stacking': 0.05150193821045774}
Hong Kong International Airport 最佳模型已保存：models/round2/Hong Kong International Airport_best_random_forest.pkl (MAPE=0.0473)
Hong Kong-Zhuhai-Macao Bridge:{'lgbm': 0.08725925686311956, 'catboost': 0.08227459259382144, 'random_forest': 0.08287476917746599, 'stacking': 0.08562526888342457}
Hong Kong-Zhuhai-Macao Bridge 最佳模型已保存：models/round2/Hong Kong-Zhuhai-Macao Bridge_best_catboost.pkl (MAPE=0.0823)
Lo Wu:{'lgbm': 0.07911219555567771, 'catboost': 0.07239533490739222, 'random_forest': 0.07984604615408192, 'stacking': 0.07744433495988719}
Lo Wu 最佳模型已保存：models/round2/Lo Wu_best_catboost.pkl (MAPE=0.0724)
Lok Ma Chau Spur Line:{'lgbm': 0.06839686903914778, 'catboost': 0.06274701847413522, 'random_forest': 0.06379522217580012, 'stacking': 0.07254226907688935}
Lok Ma Chau Spur Line 最佳模型已保存：models/round2/Lok Ma Chau Spur Line_best

In [11]:
# Forecast the next 5 days for every control point with its saved stacking model.
#
# The model paths are derived from the keys of processed_data_dict using the
# same "<name>_stacking.pkl" pattern the training cell writes. A hand-maintained
# list paired positionally with the dict would silently load the wrong model
# (or skip a series, since zip stops at the shorter input) if the dict's order
# or size ever changed.
models_path = [f"models/round2/{name}_stacking.pkl" for name in processed_data_dict]
for (name, processed_data), model_path in zip(processed_data_dict.items(), models_path):
    # joblib.load unpickles arbitrary objects: only load files produced by this project.
    model = joblib.load(model_path)
    print(forecast_future(processed_data, model, 5))

                 Date     Total  year  month  day  weekday  quarter  \
Date                                                                  
2025-04-10 2025-04-10  58126.12  2025      4   10        3        2   
2025-04-11 2025-04-11  61233.85  2025      4   11        4        2   
2025-04-12 2025-04-12  61217.26  2025      4   12        5        2   
2025-04-13 2025-04-13  67511.91  2025      4   13        6        2   
2025-04-14 2025-04-14  66031.48  2025      4   14        0        2   

            is_weekend  is_holiday     lag_1  ...   lag_60  rolling_7_mean  \
Date                                          ...                            
2025-04-10           0       False  54164.00  ...  77105.0    62847.571429   
2025-04-11           0       False  58126.12  ...  69281.0    62640.160000   
2025-04-12           1       False  61233.85  ...  59967.0    62769.995714   
2025-04-13           1        True  61217.26  ...  56073.0    63232.604286   
2025-04-14           0       False

In [11]:
from statsforecast import StatsForecast
import numpy as np
import pandas as pd

In [14]:
# For every control point: evaluate the statistical models with
# RollingForecaster, pick the one with the lowest mean MAPE, and print a
# 5-day-ahead forecast from it.
for name, data in data_dict.items():
    train, test = split_data(data)
    forecaster = RollingForecaster(train, test, steps=[3])
    metrics, predictions, ensemble_model = forecaster.run()
    # Choose the model with the smallest MAPE as the best model.
    avg_metrics = metrics.mean()  # mean MAPE per model across the evaluated step sizes
    forecaster.best_model_name = avg_metrics.idxmin()  # name of the lowest-MAPE model
    if forecaster.best_model_name == 'Ensemble':
        # Original author's note: strictly the ensemble should be retrained
        # because it differs for each h; as a simplification the ensemble
        # from the last h (as returned by run()) is used.
        forecaster.best_model = ensemble_model
        # StatsForecast needs a series-id column. Add it to a COPY: inserting
        # into `data` itself would permanently modify the frame stored in
        # data_dict, so re-running this cell (or any later cell that inserts
        # 'unique_id') would raise ValueError, and split_data would receive an
        # extra column.
        data_with_id = data.copy()
        data_with_id.insert(0, 'unique_id', 1)
        # Forecast the next five days with each base model.
        model_preds = []
        for model in forecaster.models:
            fcst = StatsForecast(models=[model], freq='D')
            base_future = fcst.forecast(df=data_with_id, h=5, time_col='Date', target_col='Total')
            model_preds.append(base_future[model.__class__.__name__].values)
        # Stack base forecasts column-wise (one column per model) and let the
        # ensemble produce the final prediction.
        X_stack = np.column_stack(model_preds)
        final_pred = forecaster.best_model.predict(X_stack)
    else:
        # Look up the base model that matches the winning name.
        forecaster.best_model = forecaster.models[forecaster.model_names.index(forecaster.best_model_name)]
        # NOTE(review): this keyword signature is assumed to be supported by the
        # objects in forecaster.models — confirm against RollingForecaster.
        final_pred = forecaster.best_model.predict(df=data, h=5, time_col='Date', target_col='Total')
    print(f'{name}: {final_pred}')

MAPE for Hong Kong International Airport:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.060589  0.069943   0.063276               0.065748  0.024932
Hong Kong International Airport: [55434.02873536 65099.23761481 61339.41088717 61463.95906746
 65864.24286833]
MAPE for Hong Kong-Zhuhai-Macao Bridge:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.187914  0.141011   0.115637               0.131724  0.059135
Hong Kong-Zhuhai-Macao Bridge: [29837.056144   30551.42689109 42879.49305301 61377.72826389
 36351.24180592]
MAPE for Lo Wu:         ETS   SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.111423  0.08659   0.086642               0.106261  0.038831
Lo Wu: [ 79038.00970737  84769.60545715 103848.41808333 133781.43255952
  89515.96027804]
MAPE for Lok Ma Chau Spur Line:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.108602  0.087946   0.084682               0.104796  0.037664
Lok Ma Chau Spur Line: [ 67316.581

In [12]:
# Rolling multi-step forecast. For every series in data_dict: pick the model
# with the lowest mean MAPE from RollingForecaster.run(), then forecast
# `total_horizon` days in chunks of at most `current_step` days, appending each
# chunk's predictions to the history before forecasting the next chunk.
for name, data in data_dict.items():
    train, test = split_data(data)
    forecaster = RollingForecaster(train, test, steps=[3])
    metrics, predictions, ensemble_model = forecaster.run()
    # Choose the model with the smallest MAPE as the best model
    avg_metrics = metrics.mean()
    forecaster.best_model_name = avg_metrics.idxmin()

    # Forecast parameters
    total_horizon = 5  # total number of days to forecast
    current_step = 3   # step size used during validation (same value as steps=[3] above)

    if forecaster.best_model_name == 'Ensemble':
        forecaster.best_model = ensemble_model
        model_preds = []
        # Work on a copy so the frame stored in data_dict is not modified.
        # DataFrame.insert raises ValueError if 'unique_id' is already a column.
        data_with_id = data.copy()
        data_with_id.insert(0, 'unique_id', 1)

        # Chunked rolling forecast
        remaining = total_horizon
        while remaining > 0:
            h = min(current_step, remaining)
            # Each base model forecasts the current chunk
            step_preds = []
            for model in forecaster.models:
                fcst = StatsForecast(models=[model], freq='D')
                pred = fcst.forecast(df=data_with_id, h=h,
                                   time_col='Date', target_col='Total')
                step_preds.append(pred[model.__class__.__name__].values)

            # Build the ensemble prediction: one column per base model
            X_stack = np.column_stack(step_preds)
            final_pred_step = forecaster.best_model.predict(X_stack)[:h]  # keep only the h values needed for this chunk

            # Store the predictions and extend the history
            model_preds.extend(final_pred_step)
            # Dates for the newly predicted days, continuing from the last known date
            last_date = data_with_id['Date'].max()
            new_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=h)
            # The ensemble's own predictions become history for the next chunk
            new_data = pd.DataFrame({
                'Date': new_dates,
                'unique_id': 1,
                'Total': final_pred_step
            })
            # concat keeps each frame's row labels (no ignore_index), so index labels repeat.
            data_with_id = pd.concat([data_with_id, new_data])
            remaining -= h

        final_pred = np.array(model_preds)[:total_horizon]  # make sure exactly total_horizon values are kept
    else:
        forecaster.best_model = forecaster.models[forecaster.model_names.index(
            forecaster.best_model_name)]
        # Rolling forecast with the single best base model
        remaining = total_horizon
        final_pred = []
        data_for_pred = data.copy()

        while remaining > 0:
            h = min(current_step, remaining)
            # NOTE(review): assumes predict(...) accepts these keywords and returns
            # a sequence of h values usable both with list.extend() and as a
            # DataFrame column — confirm against the objects in forecaster.models.
            pred = forecaster.best_model.predict(df=data_for_pred, h=h,
                                               time_col='Date', target_col='Total')
            final_pred.extend(pred)
            # Extend the history with the predictions for the next chunk
            last_date = data_for_pred['Date'].max()
            new_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=h)
            new_data = pd.DataFrame({
                'Date': new_dates,
                'Total': pred
            })
            data_for_pred = pd.concat([data_for_pred, new_data])
            remaining -= h

        final_pred = np.array(final_pred)[:total_horizon]

    print(f'{name}: {final_pred}')

Hong Kong International Airport: [58202.5824246  61979.9447619  62335.12309524 74546.83961111
 65249.42132143]
Hong Kong-Zhuhai-Macao Bridge: [28472.62615873 30921.7526206  41764.33256241 57068.64814286
 35292.66769841]
Lo Wu: [ 81204.16914826  84351.76845702 104213.90515476 129298.20061544
  91448.22364653]
Lok Ma Chau Spur Line: [ 66934.66421598  70140.99262765  92007.81601587 104477.28826804
  71202.04012227]
Shenzhen Bay: [48153.98743894 59950.14783126 74644.57615079 90985.23837302
 58227.83625794]
