In [1]:
from src.future_forecast import forecast_future
from src.preprocessing import load_data, split_data, split_processed_data
from src.forecasting import RollingForecaster
from src.feature_engineering import create_features
from src.model_fit import train_models

import warnings
# Install a blanket "ignore" filter so warnings raised by later cells are not shown.
# NOTE(review): this also hides deprecation/convergence warnings from the modelling
# libraries; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load every round-1 Excel file matched by the glob pattern.
# The displayed result below shows a dict keyed by control-point name, each value
# a DataFrame with `Date` and `Total` columns.
data_dict = load_data('data/round_1/*.xlsx')

In [3]:
# Inspect the raw frames (one per control point) right after loading.
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 817 2025-04-04  60325
 818 2025-04-05  57979
 819 2025-04-06  78654
 820 2025-04-07  68436
 821 2025-04-08  60797
 
 [822 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 817 2025-04-04  52496
 818 2025-04-05  57120
 819 2025-04-06  85363
 820 2025-04-07  33758
 821 2025-04-08  28809
 
 [822 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 788 2025-04-04  128351
 789 2025-04-05  114026
 790 2025-04-06  137628
 791 2025-04-07   92555
 792 2025-04-08   82638
 
 [793 rows x 2 columns],
 '

In [4]:
# Run feature engineering on each control point's raw frame, keeping the same keys
# as data_dict so raw and processed frames can be looked up by the same name.
processed_data_dict = {
    port_name: create_features(raw_frame)
    for port_name, raw_frame in data_dict.items()
}

In [5]:
# Inspect the engineered frames: calendar, holiday, lag and rolling-window columns
# are added next to `Date` and `Total`.
# NOTE(review): the first displayed row has index 60 — presumably the rows without
# a full lag_60 history are dropped by create_features; confirm.
processed_data_dict

{'Hong Kong International Airport':           Date  Total  year  month  day  weekday  quarter  is_weekend  \
 60  2023-03-09  31792  2023      3    9        3        1           0   
 61  2023-03-10  33559  2023      3   10        4        1           0   
 62  2023-03-11  33421  2023      3   11        5        1           1   
 63  2023-03-12  37163  2023      3   12        6        1           1   
 64  2023-03-13  35531  2023      3   13        0        1           0   
 ..         ...    ...   ...    ...  ...      ...      ...         ...   
 817 2025-04-04  60325  2025      4    4        4        2           0   
 818 2025-04-05  57979  2025      4    5        5        2           1   
 819 2025-04-06  78654  2025      4    6        6        2           1   
 820 2025-04-07  68436  2025      4    7        0        2           0   
 821 2025-04-08  60797  2025      4    8        1        2           0   
 
      is_holiday    lag_1  ...   lag_60  rolling_7_mean  rolling_7_std  \
 

In [6]:
# Display the raw frames again after feature engineering.
# NOTE(review): presumably a check that create_features did not modify the raw
# frames in place — confirm, or drop this cell if it is no longer needed.
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 817 2025-04-04  60325
 818 2025-04-05  57979
 819 2025-04-06  78654
 820 2025-04-07  68436
 821 2025-04-08  60797
 
 [822 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 817 2025-04-04  52496
 818 2025-04-05  57120
 819 2025-04-06  85363
 820 2025-04-07  33758
 821 2025-04-08  28809
 
 [822 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 788 2025-04-04  128351
 789 2025-04-05  114026
 790 2025-04-06  137628
 791 2025-04-07   92555
 792 2025-04-08   82638
 
 [793 rows x 2 columns],
 '

In [7]:
import joblib
import os
# Make sure the output directory for the pickled models exists;
# exist_ok=True makes re-running this cell a no-op.
os.makedirs('models/round1', exist_ok=True)

In [8]:
# Train the candidate models for every control point, then persist both the
# lowest-MAPE model and the stacking model under models/round1/.
for name, processed_data in processed_data_dict.items():
    X_train, X_test, y_train, y_test = split_processed_data(processed_data)
    best_models, stacking, mape = train_models(X_train, y_train, X_test, y_test)

    print(f"Best models for {name}: {best_models}")
    print(f"Stacking model for {name}: {stacking}")
    print(mape)

    # `mape` maps model name -> MAPE; the winner is the key with the smallest value.
    best_model_name = min(mape, key=lambda model_key: mape[model_key])

    # The stacking model is returned separately from `best_models`, so it needs
    # its own lookup when it is the winner.
    best_model = stacking if best_model_name == 'stacking' else best_models[best_model_name]

    # File names carry the control-point name so each one gets its own files.
    filename = f"models/round1/{name}_best_{best_model_name}.pkl"
    stacking_filename = f"models/round1/{name}_stacking.pkl"

    joblib.dump(best_model, filename)
    # The stacking model is always saved on its own as well, winner or not.
    joblib.dump(stacking, stacking_filename)

    print(f"{name} 最佳模型已保存：{filename} (MAPE={mape[best_model_name]:.4f})")

KeyboardInterrupt: 

In [9]:
# Saved best model for each control point, keyed by control-point name.
# Pairing by name (instead of zipping a list against processed_data_dict) means a
# model can never be applied to the wrong control point if the dict order changes,
# and a missing control point raises a KeyError instead of being silently skipped.
model_path_by_name = {
    'Hong Kong International Airport': 'models/round1/Hong Kong International Airport_best_catboost.pkl',
    'Hong Kong-Zhuhai-Macao Bridge': 'models/round1/Hong Kong-Zhuhai-Macao Bridge_best_catboost.pkl',
    'Lo Wu': 'models/round1/Lo Wu_best_lgbm.pkl',
    'Lok Ma Chau Spur Line': 'models/round1/Lok Ma Chau Spur Line_best_catboost.pkl',
}
# Kept as a plain list of paths for any cell that still reads `models_path`.
models_path = list(model_path_by_name.values())

for name, model_path in model_path_by_name.items():
    processed_data = processed_data_dict[name]
    # joblib.load unpickles the file, which can execute arbitrary code:
    # only load model files produced by this project.
    model = joblib.load(model_path)
    # Third argument is the number of future days to forecast.
    print(forecast_future(processed_data, model, 5))

                 Date         Total  year  month  day  weekday  quarter  \
Date                                                                      
2025-04-09 2025-04-09  64872.672344  2025      4    9        2        2   
2025-04-10 2025-04-10  72705.776567  2025      4   10        3        2   
2025-04-11 2025-04-11  79626.001357  2025      4   11        4        2   
2025-04-12 2025-04-12  80174.403869  2025      4   12        5        2   
2025-04-13 2025-04-13  78104.862701  2025      4   13        6        2   

            is_weekend  is_holiday         lag_1  ...   lag_60  \
Date                                              ...            
2025-04-09           0       False  60797.000000  ...  70249.0   
2025-04-10           0       False  64872.672344  ...  77105.0   
2025-04-11           0       False  72705.776567  ...  69281.0   
2025-04-12           1       False  79626.001357  ...  59967.0   
2025-04-13           1        True  80174.403869  ...  56073.0   

           

In [8]:
from statsforecast import StatsForecast
import numpy as np

In [17]:
FORECAST_HORIZON = 5  # number of future days to predict for each control point


def _forecast_with_statsforecast(model, history, horizon=FORECAST_HORIZON):
    """Forecast the next ``horizon`` days of ``Total`` with one StatsForecast model.

    Args:
        model: A model instance accepted by ``StatsForecast(models=[...])``.
        history: DataFrame with ``Date`` and ``Total`` columns. It is not modified.
        horizon: Number of future daily steps to forecast.

    Returns:
        Array of ``horizon`` predicted values, read from the forecast column
        named after the model's class.
    """
    # StatsForecast needs a series-id column. Add it to a copy so the frames held
    # in data_dict stay untouched and this cell can be re-run safely (inserting
    # into the shared frame fails with "already exists" on the second run).
    frame = history.copy()
    if 'unique_id' not in frame.columns:
        frame.insert(0, 'unique_id', 1)
    fcst = StatsForecast(models=[model], freq='D')
    future = fcst.forecast(df=frame, h=horizon, time_col='Date', target_col='Total')
    return future[model.__class__.__name__].values


for name, data in data_dict.items():
    train, test = split_data(data)
    forecaster = RollingForecaster(train, test, steps=[FORECAST_HORIZON])
    metrics, predictions, ensemble_model = forecaster.run()
    print(f"MAPE for {name}: {metrics}")

    # `metrics` has one row per step size and one column per model; average over
    # the step sizes and pick the model with the lowest mean MAPE.
    avg_metrics = metrics.mean()
    forecaster.best_model_name = avg_metrics.idxmin()

    if forecaster.best_model_name == 'Ensemble':
        # Simplification: reuse the ensemble returned by run() rather than
        # retraining it (the ensemble differs for every step size h).
        forecaster.best_model = ensemble_model
        # The ensemble takes one column of base-model forecasts per model.
        model_preds = [
            _forecast_with_statsforecast(model, data) for model in forecaster.models
        ]
        X_stack = np.column_stack(model_preds)
        final_pred = forecaster.best_model.predict(X_stack)
    else:
        # Look up the base model that matches the winning name.
        forecaster.best_model = forecaster.models[
            forecaster.model_names.index(forecaster.best_model_name)
        ]
        # Go through StatsForecast.forecast (fit + predict on the full history),
        # the same call used for the ensemble's base forecasts; a statsforecast
        # model's own predict() does not take df/time_col/target_col.
        # NOTE(review): assumes forecaster.models are plain statsforecast models,
        # as the ensemble branch already does — confirm.
        final_pred = _forecast_with_statsforecast(forecaster.best_model, data)

    print(f'{name}: {final_pred}')

MAPE for Hong Kong International Airport:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.054363  0.065719   0.064407               0.060498  0.025437
Hong Kong International Airport: [65650.49749164 67850.65719444 73783.88344156 72369.33663781
 74852.56161905]
MAPE for Hong Kong-Zhuhai-Macao Bridge:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.176969  0.135651    0.11284               0.130564  0.069051
Hong Kong-Zhuhai-Macao Bridge: [29316.7107657  28855.09441288 35771.00339438 37831.29619048
 59154.73315368]
MAPE for Lo Wu:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.114359  0.088011   0.093226               0.107999    0.0425
Lo Wu: [ 80033.64343899  78767.54650761  78861.47065602  97517.93509632
 123448.51913889]
MAPE for Lok Ma Chau Spur Line:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta  Ensemble
5  0.094359  0.082612   0.078478               0.088652  0.042593
Lok Ma Chau Spur Line: [ 74790.2