In [9]:
from src.model_predict import forecast_future
from src.preprocessing import load_data, split_data, split_processed_data
from src.forecasting import RollingForecaster
from src.feature_engineering import create_features
from src.model_fit import train_models

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings raised while fitting models.
warnings.filterwarnings("ignore")

In [10]:
# Load the round-4 input data. The result displayed in the next cell is a dict
# that maps each location name to a DataFrame with `Date` and `Total` columns.
data_dict = load_data('data/round_4/*.xlsx')

In [11]:
# Inspect the loaded frames (one Date/Total frame per location name).
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 820 2025-04-07  68436
 821 2025-04-08  60797
 822 2025-04-09  54164
 823 2025-04-10  55832
 824 2025-04-11  64252
 
 [825 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 820 2025-04-07  33758
 821 2025-04-08  28809
 822 2025-04-09  28309
 823 2025-04-10  28153
 824 2025-04-11  31023
 
 [825 rows x 2 columns],
 'Lo Wu':           Date  Total
 0   2023-02-06  30319
 1   2023-02-07  32954
 2   2023-02-08  35022
 3   2023-02-09  34764
 4   2023-02-10  39235
 ..         ...    ...
 791 2025-04-07  92555
 792 2025-04-08  82638
 793 2025-04-09  81359
 794 2025-04-10  80281
 795 2025-04-11  82676
 
 [796 rows x 2 columns],
 'Lok Ma Chau 

In [12]:
# Run feature engineering once per location. `create_features` returns a pair:
# the first element is kept in `df_dict` (used later for forecasting), the
# second in `processed_data_dict` (used later for model training).
df_dict, processed_data_dict = {}, {}
for name, data in data_dict.items():
    engineered = create_features(data)
    df, processed_data = engineered
    df_dict[name], processed_data_dict[name] = df, processed_data

日期超出范围警告: 2026-04-09 00:00:00 - no available data for year 2026, only year between [2004, 2025] supported


In [13]:
# Inspect the training frames produced by create_features (calendar, holiday
# and rolling-window columns are visible in the output).
processed_data_dict

{'Hong Kong International Airport':           Date  Total  year  month  day  weekday  quarter  is_weekend  \
 61  2023-03-10  33559  2023      3   10        4        1           0   
 62  2023-03-11  33421  2023      3   11        5        1           1   
 63  2023-03-12  37163  2023      3   12        6        1           1   
 64  2023-03-13  35531  2023      3   13        0        1           0   
 65  2023-03-14  30323  2023      3   14        1        1           0   
 ..         ...    ...   ...    ...  ...      ...      ...         ...   
 820 2025-04-07  68436  2025      4    7        0        2           0   
 821 2025-04-08  60797  2025      4    8        1        2           0   
 822 2025-04-09  54164  2025      4    9        2        2           0   
 823 2025-04-10  55832  2025      4   10        3        2           0   
 824 2025-04-11  64252  2025      4   11        4        2           0   
 
      is_hk_holiday  is_cn_holiday  ...  rolling_30_mean  ma_ratio_7_30  \


In [14]:
# Inspect the forecasting frames produced by create_features; unlike
# processed_data_dict these also carry the `ex` column shown in the output.
df_dict

{'Hong Kong International Airport':           Date  Total      ex  year  month  day  weekday  quarter  is_weekend  \
 61  2023-03-10  33559  0.8797  2023      3   10        4        1           0   
 62  2023-03-11  33421  0.8797  2023      3   11        5        1           1   
 63  2023-03-12  37163  0.8797  2023      3   12        6        1           1   
 64  2023-03-13  35531  0.8727  2023      3   13        0        1           0   
 65  2023-03-14  30323  0.8751  2023      3   14        1        1           0   
 ..         ...    ...     ...   ...    ...  ...      ...      ...         ...   
 820 2025-04-07  68436  0.9408  2025      4    7        0        2           0   
 821 2025-04-08  60797  0.9444  2025      4    8        1        2           0   
 822 2025-04-09  54164  0.9444  2025      4    9        2        2           0   
 823 2025-04-10  55832  0.9444  2025      4   10        3        2           0   
 824 2025-04-11  64252  0.9444  2025      4   11        4      

In [15]:
import numpy as np
import joblib
import os
# Create the directory the training cell writes its model files to;
# exist_ok=True keeps the cell re-runnable.
os.makedirs('models/round4', exist_ok=True)

In [16]:
# Share of the total importance that the retained features must cover.
IMPORTANCE_COVERAGE = 0.98
# Lower bound on the number of retained features.
MIN_FEATURES = 10


def _pick_best(best_models, stacking, mape):
    """Return ``(name, model)`` for the entry of ``mape`` with the lowest error.

    ``mape`` maps model names to their error; the special name ``'stacking'``
    refers to the ``stacking`` model, every other name is looked up in
    ``best_models``.
    """
    best_name = min(mape, key=mape.get)
    if best_name == 'stacking':
        return best_name, stacking
    return best_name, best_models[best_name]


def _importance_and_names(model, fallback_names):
    """Return ``(importance, names)`` for a fitted model.

    Importance is read from ``feature_importances_`` or, failing that, from
    ``get_feature_importance()``. A model exposing neither gets uniform
    importance. Names are taken from the first of ``feature_names_in_``
    (scikit-learn), ``feature_name_`` (LightGBM) or ``feature_names_``
    (CatBoost) whose length matches the importance vector; otherwise
    ``fallback_names`` is used.
    """
    if hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
    elif hasattr(model, 'get_feature_importance'):
        importance = model.get_feature_importance()
    else:
        return np.ones(len(fallback_names)), list(fallback_names)

    # Some libraries return plain lists; an array is needed for fancy indexing.
    importance = np.asarray(importance, dtype=float)
    for attr in ('feature_names_in_', 'feature_name_', 'feature_names_'):
        names = getattr(model, attr, None)
        if names is not None and len(names) == len(importance):
            return importance, list(names)
    return importance, list(fallback_names)


def _select_features(importance, names, coverage=IMPORTANCE_COVERAGE,
                     min_features=MIN_FEATURES):
    """Return the most important feature names, in descending importance.

    Keeps the smallest prefix of the importance ranking whose cumulative
    importance reaches ``coverage`` of the total, but never fewer than
    ``min_features`` names (or all of them if there are fewer).
    """
    order = np.argsort(importance)[::-1]
    cumulative = np.cumsum(importance[order])
    cutoff = np.flatnonzero(cumulative >= coverage * cumulative[-1])[0]
    keep = max(cutoff + 1, min_features)
    return [names[i] for i in order[:keep]]


for name, processed_data in processed_data_dict.items():
    X_train, X_test, y_train, y_test = split_processed_data(processed_data)
    all_columns = X_train.columns.tolist()

    # First training round: used to obtain feature importances.
    best_models, stacking, mape = train_models(X_train, y_train, X_test, y_test)
    best_model_name, best_model = _pick_best(best_models, stacking, mape)

    # A stacking model has no importances of its own, so its first fitted base
    # estimator is used instead (when available).
    importance_source = best_model
    if best_model_name == 'stacking':
        base_estimators = getattr(best_model, 'estimators_', None)
        if base_estimators is not None and len(base_estimators) > 0:
            importance_source = base_estimators[0]
    importance, features = _importance_and_names(importance_source, all_columns)

    # Dynamic threshold: keep the features covering IMPORTANCE_COVERAGE of the
    # total importance, with at least MIN_FEATURES of them.
    selected_features = _select_features(importance, features)

    if len(selected_features) < len(features):
        print(f"{name} 特征筛选: 从 {len(features)} 个特征中保留 {len(selected_features)} 个")
        X_train_selected = X_train[selected_features]
        X_test_selected = X_test[selected_features]

        # Second training round on the reduced feature set.
        best_models, stacking, mape = train_models(X_train_selected, y_train, X_test_selected, y_test)
        best_model_name, best_model = _pick_best(best_models, stacking, mape)
        model_features = selected_features
    else:
        # Nothing was dropped, so the model kept is the one fitted on X_train.
        # Record X_train's own column order rather than the importance-sorted
        # order, so the saved list always matches what the model was fitted on.
        model_features = all_columns

    # Persist the best model together with the feature list it was fitted on.
    filename = f"models/round4/{name}_best_{best_model_name}.pkl"
    joblib.dump({
        'model': best_model,
        'features': model_features
    },
        filename)
    # The stacking model is also saved on its own (bare estimator, no feature list).
    stacking_filename = f"models/round4/{name}_stacking.pkl"
    joblib.dump(stacking, stacking_filename)
    print(f"{name} 最佳模型已保存：{filename} (MAPE={mape[best_model_name]:.4f})")

Hong Kong International Airport 特征筛选: 从 41 个特征中保留 36 个
Hong Kong International Airport 最佳模型已保存：models/round4/Hong Kong International Airport_best_catboost.pkl (MAPE=0.0503)
Hong Kong-Zhuhai-Macao Bridge 特征筛选: 从 41 个特征中保留 35 个
Hong Kong-Zhuhai-Macao Bridge 最佳模型已保存：models/round4/Hong Kong-Zhuhai-Macao Bridge_best_catboost.pkl (MAPE=0.0839)
Lo Wu 特征筛选: 从 41 个特征中保留 32 个
Lo Wu 最佳模型已保存：models/round4/Lo Wu_best_stacking.pkl (MAPE=0.0827)
Lok Ma Chau Spur Line 特征筛选: 从 41 个特征中保留 35 个
Lok Ma Chau Spur Line 最佳模型已保存：models/round4/Lok Ma Chau Spur Line_best_catboost.pkl (MAPE=0.0785)
Shenzhen Bay 特征筛选: 从 41 个特征中保留 36 个
Shenzhen Bay 最佳模型已保存：models/round4/Shenzhen Bay_best_catboost.pkl (MAPE=0.0941)


In [23]:
# Saved best-model file for each series, listed in the same order as the keys
# of df_dict because the forecasting cell pairs the two positionally.
# NOTE(review): the file names embed the winning model of one particular
# training run; they must be updated by hand if a re-run selects other models.
_best_model_file_by_series = {
    'Hong Kong International Airport': 'models/round4/Hong Kong International Airport_best_catboost.pkl',
    'Hong Kong-Zhuhai-Macao Bridge': 'models/round4/Hong Kong-Zhuhai-Macao Bridge_best_catboost.pkl',
    'Lo Wu': 'models/round4/Lo Wu_best_stacking.pkl',
    'Lok Ma Chau Spur Line': 'models/round4/Lok Ma Chau Spur Line_best_catboost.pkl',
    'Shenzhen Bay': 'models/round4/Shenzhen Bay_best_catboost.pkl',
}
models_path = list(_best_model_file_by_series.values())

In [24]:
# Forecast 5 future days per series with its saved best model.
# Each pickle holds a dict with the fitted model under 'model' and the feature
# list it expects under 'features'; series and paths are paired by position.
for model_path, (name, df) in zip(models_path, df_dict.items()):
    bundle = joblib.load(model_path)
    future = forecast_future(df, bundle['model'], 5, bundle['features'])
    print(name , future['Total'])

Hong Kong International Airport Date
2025-04-12    61208.568957
2025-04-13    69224.507203
2025-04-14    64748.726469
2025-04-15    56794.726875
2025-04-16    57931.161487
Name: Total, dtype: float64
Hong Kong-Zhuhai-Macao Bridge Date
2025-04-12    40448.708258
2025-04-13    63988.245807
2025-04-14    41964.972504
2025-04-15    30036.957906
2025-04-16    28802.390404
Name: Total, dtype: float64
Lo Wu Date
2025-04-12    107729.227288
2025-04-13    133895.316145
2025-04-14     91491.627270
2025-04-15     80107.685255
2025-04-16     79961.832655
Name: Total, dtype: float64
Lok Ma Chau Spur Line Date
2025-04-12     93166.338413
2025-04-13    109496.750411
2025-04-14     74724.425787
2025-04-15     67924.883855
2025-04-16     64191.542421
Name: Total, dtype: float64
Shenzhen Bay Date
2025-04-12    67162.316407
2025-04-13    90512.807945
2025-04-14    53417.879024
2025-04-15    47088.128824
2025-04-16    48128.522687
Name: Total, dtype: float64


In [25]:
from statsforecast import StatsForecast
import numpy as np
import pandas as pd

In [26]:
# Forecast step used for each series; paired positionally with data_dict when
# the statistical models are trained, so the order must match its keys.
step_list = [1, 1, 1, 3, 1]
result_dict = {}  # Stores the forecasting results of every series

In [27]:
def process_portfolio(name, data, step):
    """Fit and evaluate the rolling forecaster for one series.

    The data is split with ``split_data``, a ``RollingForecaster`` is built for
    the single step ``step`` and run, and the model whose column has the lowest
    mean in the returned metrics table is recorded on the forecaster as
    ``best_model_name``.

    Args:
        name: Label of the series; returned unchanged.
        data: Input accepted by ``split_data``; returned unchanged.
        step: Step passed to the forecaster as ``steps=[step]``.

    Returns:
        dict with keys ``'name'``, ``'forecaster'``, ``'ensemble_model'``,
        ``'metrics'``, ``'data'`` and ``'step'``, i.e. everything the later
        forecasting cell needs.
    """
    train_part, test_part = split_data(data)
    forecaster = RollingForecaster(train_part, test_part, steps=[step])
    run_output = forecaster.run()
    metrics, _predictions, ensemble_model = run_output

    # Pick the model with the lowest average metric.
    forecaster.best_model_name = metrics.mean().idxmin()

    return {
        'name': name,
        'forecaster': forecaster,
        'ensemble_model': ensemble_model,
        'metrics': metrics,
        'data': data,
        'step': step
    }

In [28]:
# Train and evaluate the statistical models series by series. Steps are taken
# from step_list by position, so its order has to follow data_dict's keys.
for step, (name, data) in zip(step_list, data_dict.items()):
    result = process_portfolio(name, data, step)
    result_dict[name] = result
    series_metrics = result["metrics"]
    print(f"{name} 模型训练完成")
    print(f'MAPE: {series_metrics}')

Hong Kong International Airport 模型训练完成
MAPE:       ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
1  0.0453  0.052682   0.051067               0.049523  0.042989  0.049721
Hong Kong-Zhuhai-Macao Bridge 模型训练完成
MAPE:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
1  0.119908  0.097976   0.088274               0.092222  0.121756  0.148464
Lo Wu 模型训练完成
MAPE:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
1  0.088341  0.086154   0.075257               0.080604  0.081791  0.090242
Lok Ma Chau Spur Line 模型训练完成
MAPE:        ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
3  0.09979  0.106409   0.091584               0.088334  0.104209  0.106718
Shenzhen Bay 模型训练完成
MAPE:        ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
1  0.12274  0.098651   0.094143               0.104058  0.099442  0.104684


In [29]:
def rolling_predict(data, model, days, horizon):
    """Forecast ``days`` future daily values, ``horizon`` steps at a time.

    Each round fits ``model`` through ``StatsForecast`` on the current history,
    forecasts ``horizon`` steps, keeps as many of them as are still needed and
    appends them to the history so the next round can build on them.

    Args:
        data: DataFrame with ``Date`` and ``Total`` columns. When it also has a
            ``unique_id`` column, the appended forecast rows inherit the id of
            the last row so they stay part of the same series.
        model: A model instance accepted by ``StatsForecast(models=[...])``.
        days: Total number of values to forecast.
        horizon: Number of steps forecast per round; must be at least 1.

    Returns:
        list with ``days`` forecast values in chronological order (empty when
        ``days`` is not positive). ``data`` itself is not modified.

    Raises:
        ValueError: If ``horizon`` is smaller than 1.
        KeyError: If the forecast output has no column for ``model``.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be a positive integer, got {horizon!r}")

    has_id_column = 'unique_id' in data.columns
    predictions = []  # forecast values collected so far
    for i in range(0, days, horizon):
        fcst = StatsForecast(
            models=[model],
            freq='D',  # daily frequency
        )
        forecast = fcst.forecast(
            df=data,
            h=horizon,
            time_col='Date',
            target_col='Total'
        )

        # The output column is normally named after the model class, but an
        # aliased model is reported under its repr instead.
        pred_col = next(
            (c for c in (model.__class__.__name__, repr(model)) if c in forecast.columns),
            None,
        )
        if pred_col is None:
            raise KeyError(
                f"no forecast column found for {model.__class__.__name__}; "
                f"available columns: {list(forecast.columns)}"
            )

        # The last round may need fewer than `horizon` values.
        pred = forecast[pred_col].values[:min(days - i, horizon)]
        predictions.extend(pred)

        # Append the forecasts as pseudo-observations for the next round.
        pred_df = pd.DataFrame({
            'Date': pd.date_range(start=data['Date'].iloc[-1], periods=len(pred) + 1, freq='D')[1:],
            'Total': pred
        })
        if has_id_column:
            # Without this the new rows would get a NaN id from pd.concat and
            # would no longer belong to the series being forecast.
            pred_df.insert(0, 'unique_id', data['unique_id'].iloc[-1])
            data = pd.concat([data, pred_df], ignore_index=True)
        else:
            data = pd.concat([data, pred_df])
    return predictions

In [30]:
total_horizon = 5  # total number of days to forecast

for name, res in result_dict.items():
    # Unpack what process_portfolio stored for this series.
    forecaster = res['forecaster']
    data = res['data']
    step = res['step']

    current_step = step   # reuse the step that was used during validation
    # Lowest metric across models at that step (reported alongside the forecast).
    metrics_min = res['metrics'].loc[current_step].min()
    # The forecasting code expects a series identifier column.
    data_with_id = data.copy()
    data_with_id.insert(0, 'unique_id', 1)
    if forecaster.best_model_name == 'Ensemble':
        # Forecast with every base model, then let the ensemble model combine
        # the per-model forecasts (one column per base model).
        ensemble_model = res['ensemble_model']
        models_preds = [
            rolling_predict(data_with_id, model, total_horizon, current_step)
            for model in forecaster.models
        ]
        X_stack = np.column_stack(models_preds)
        final_pred = ensemble_model.predict(X_stack)[:total_horizon]
    else:
        best_model = forecaster.models[forecaster.model_names.index(forecaster.best_model_name)]
        # NOTE(review): this branch uses the forecaster's own rolling_predict
        # while the ensemble branch uses the notebook-level function; confirm
        # the two behave the same (the printed forecasts repeat after the
        # first round, which suggests earlier forecasts are not fed back).
        final_pred = forecaster.rolling_predict(data_with_id, best_model, total_horizon, current_step)
    print(f'{name}: 未来{total_horizon}天预测结果{final_pred} ')
    print(f'MAPE: {metrics_min} MODEL: {forecaster.best_model_name}')

Hong Kong International Airport: 未来5天预测结果[61333.895, 61333.893614439556, 61333.893614439556, 61333.893614439556, 61333.893614439556] 
MAPE: 0.042989340844492595 MODEL: MSTL
Hong Kong-Zhuhai-Macao Bridge: 未来5天预测结果[39707.42, 38817.16789435509, 38817.16789435509, 38817.16789435509, 38817.16789435509] 
MAPE: 0.08827384891311528 MODEL: AutoTBATS
Lo Wu: 未来5天预测结果[99307.75, 99304.158592681, 99304.158592681, 99304.158592681, 99304.158592681] 
MAPE: 0.07525674907594873 MODEL: AutoTBATS
Lok Ma Chau Spur Line: 未来5天预测结果[88211.3, 103819.664, 72493.734, 88211.29787639223, 103819.66379185294] 
MAPE: 0.08833420960487548 MODEL: DynamicOptimizedTheta
Shenzhen Bay: 未来5天预测结果[65027.832, 65516.36866947802, 65516.36866947802, 65516.36866947802, 65516.36866947802] 
MAPE: 0.09414306522099793 MODEL: AutoTBATS
