In [7]:
from src.model_predict import forecast_future
from src.preprocessing import load_data, split_processed_data
from src.forecasting import process_portfolio, rolling_predict
from src.feature_engineering import create_features
from src.model_fit import train_models
import numpy as np
import joblib
import os
# Make sure every model output directory exists before anything is saved.
for model_dir in ('models/round7_full', 'models/round7', 'models/round7_simple'):
    os.makedirs(model_dir, exist_ok=True)
import warnings
# Suppress every warning category from here on.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the modelling libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [8]:
# Load the round-7 Excel workbooks matched by the glob pattern.
# Judging by how it is used later (`.items()`), the result maps a portfolio
# name to its data frame — TODO confirm against src.preprocessing.load_data.
data_dict = load_data('data/round_7/*.xlsx')

In [9]:
# Echo the loaded mapping to eyeball each portfolio's date range and row count.
data_dict

{'Hong Kong International Airport':           Date  Total
 0   2023-01-08  30275
 1   2023-01-09  24796
 2   2023-01-10  24012
 3   2023-01-11  23273
 4   2023-01-12  23053
 ..         ...    ...
 823 2025-04-10  55832
 824 2025-04-11  64252
 825 2025-04-12  62073
 826 2025-04-13  68078
 827 2025-04-14  64296
 
 [828 rows x 2 columns],
 'Hong Kong-Zhuhai-Macao Bridge':           Date  Total
 0   2023-01-08   5321
 1   2023-01-09   4937
 2   2023-01-10   5617
 3   2023-01-11   6191
 4   2023-01-12   6699
 ..         ...    ...
 823 2025-04-10  28153
 824 2025-04-11  31023
 825 2025-04-12  34587
 826 2025-04-13  55886
 827 2025-04-14  36194
 
 [828 rows x 2 columns],
 'Lo Wu':           Date   Total
 0   2023-02-06   30319
 1   2023-02-07   32954
 2   2023-02-08   35022
 3   2023-02-09   34764
 4   2023-02-10   39235
 ..         ...     ...
 794 2025-04-10   80281
 795 2025-04-11   82676
 796 2025-04-12   81797
 797 2025-04-13  119651
 798 2025-04-14   87925
 
 [799 rows x 2 columns],
 '

In [10]:
# Build both feature views for every portfolio in a single pass.
df_dict, processed_data_dict = {}, {}
for name, data in data_dict.items():
    # create_features returns a pair; the first element is kept in df_dict,
    # the second (the frame later split for training) in processed_data_dict.
    df_dict[name], processed_data_dict[name] = create_features(data)

In [11]:
# Inspect the last rows of the 'Lo Wu' frame that is later split for training
# (the second value returned by create_features).
processed_data_dict['Lo Wu'].tail()

Unnamed: 0,Date,Total,airport_sum,airport_pc,airport_mobile,bay_sum,bay_pc,bay_mobile,bridge_sum,bridge_pc,...,ex_rate_volatility_7d,rolling_7_mean,rolling_7_std,rolling_30_mean,ma_ratio_7_30,rolling_7_ex_rate_mean,month_sin,days_since_start,days_squared,weekly_growth
794,2025-04-10,80281,263,89,174,221,85,136,1533,347,...,0.00138,101769.142857,24776.174963,92987.233333,1.094442,0.934414,0.866025,794,630436,0.952972
795,2025-04-11,82676,272,91,181,254,87,167,1650,341,...,0.001536,102405.428571,24045.320285,92956.0,1.101655,0.934814,0.866025,795,632025,1.565046
796,2025-04-12,81797,257,76,181,234,74,160,1516,279,...,0.001561,95880.428571,21935.940318,93117.033333,1.029677,0.935214,0.866025,796,633616,1.144196
797,2025-04-13,119651,256,76,180,233,68,165,1410,234,...,0.002241,91276.285714,20847.890155,93166.2,0.979715,0.936371,0.866025,797,635209,1.062043
798,2025-04-14,87925,280,116,164,306,127,179,1546,391,...,0.003553,88708.142857,14249.459831,94429.033333,0.939416,0.937814,0.866025,798,636804,1.119626


In [14]:
# Inspect the last rows of the 'Lo Wu' frame that is later passed to
# forecast_future (the first value returned by create_features).
df_dict['Lo Wu'].tail()

Unnamed: 0,Date,Total,ex,airport_sum,airport_pc,airport_mobile,bay_sum,bay_pc,bay_mobile,bridge_sum,...,ex_rate_volatility_7d,rolling_7_mean,rolling_7_std,rolling_30_mean,ma_ratio_7_30,rolling_7_ex_rate_mean,month_sin,days_since_start,days_squared,weekly_growth
794,2025-04-10,80281,0.9428,263,89,174,221,85,136,1533,...,0.00138,101769.142857,24776.174963,92987.233333,1.094442,0.934414,0.866025,794,630436,0.952972
795,2025-04-11,82676,0.9429,272,91,181,254,87,167,1650,...,0.001536,102405.428571,24045.320285,92956.0,1.101655,0.934814,0.866025,795,632025,1.565046
796,2025-04-12,81797,0.9429,257,76,181,234,74,160,1516,...,0.001561,95880.428571,21935.940318,93117.033333,1.029677,0.935214,0.866025,796,633616,1.144196
797,2025-04-13,119651,0.9429,256,76,180,233,68,165,1410,...,0.002241,91276.285714,20847.890155,93166.2,0.979715,0.936371,0.866025,797,635209,1.062043
798,2025-04-14,87925,0.9429,280,116,164,306,127,179,1546,...,0.003553,88708.142857,14249.459831,94429.033333,0.939416,0.937814,0.866025,798,636804,1.119626


In [15]:
# Train on the full feature set and persist the winning model per portfolio.
full_models = []
for name, processed_data in processed_data_dict.items():
    X_train, X_test, y_train, y_test = split_processed_data(processed_data)

    # Fit every candidate; `mape` is keyed by model name.
    best_models, stacking, mape = train_models(X_train, y_train, X_test, y_test)

    # The lowest MAPE wins. The stacking ensemble is returned separately from
    # the per-model dict, so it needs its own lookup.
    best_model_name = min(mape, key=mape.get)
    best_model = stacking if best_model_name == 'stacking' else best_models[best_model_name]
    best_mape = mape[best_model_name]

    # Persist the winner together with the metadata needed to reuse it.
    filename = f"models/round7_full/{name}_best_{best_model_name}.pkl"
    payload = {
        'model': best_model,
        'features': X_train.columns.tolist(),
        'name': name,
        'mape': best_mape,
    }
    joblib.dump(payload, filename)
    full_models.append(filename)

    # The stacking ensemble is always saved as well, whether or not it won.
    stacking_filename = f"models/round7_full/{name}_stacking.pkl"
    joblib.dump(stacking, stacking_filename)
    print(f"{name} 最佳模型已保存：{filename} (MAPE={best_mape:.4f})")

Hong Kong International Airport 最佳模型已保存：models/round7_full/Hong Kong International Airport_best_catboost.pkl (MAPE=0.0485)
Hong Kong-Zhuhai-Macao Bridge 最佳模型已保存：models/round7_full/Hong Kong-Zhuhai-Macao Bridge_best_catboost.pkl (MAPE=0.0804)
Lo Wu 最佳模型已保存：models/round7_full/Lo Wu_best_catboost.pkl (MAPE=0.0751)
Lok Ma Chau Spur Line 最佳模型已保存：models/round7_full/Lok Ma Chau Spur Line_best_catboost.pkl (MAPE=0.0767)
Shenzhen Bay 最佳模型已保存：models/round7_full/Shenzhen Bay_best_catboost.pkl (MAPE=0.0955)


In [16]:
# Feature-selection pass: fit once on all features, rank them by importance,
# keep the top ones, refit on the reduced set and save under models/round7/.
IMPORTANCE_COVERAGE = 0.9  # share of the cumulative importance to retain
MIN_FEATURES = 20          # never keep fewer than this many features

models = []
for name, processed_data in processed_data_dict.items():
    X_train, X_test, y_train, y_test = split_processed_data(processed_data)

    # First fit: only used to obtain a feature-importance ranking.
    best_models, stacking, mape = train_models(X_train, y_train, X_test, y_test)

    # Pick the candidate with the lowest MAPE.
    best_model_name = min(mape, key=mape.get)
    best_model = stacking if best_model_name == 'stacking' else best_models[best_model_name]

    # Columns (in fit order) of the model that will be saved. Updated below if
    # the model is refitted on a reduced feature set.
    model_features = X_train.columns.tolist()

    # A stacking winner exposes no importances of its own, so rank features with
    # its first fitted base estimator instead.
    importance_source = best_model.estimators_[0] if best_model_name == 'stacking' else best_model

    if hasattr(importance_source, 'feature_importances_'):
        importance = importance_source.feature_importances_
        # The attribute holding the feature names differs between libraries.
        if hasattr(importance_source, 'feature_names_in_'):  # sklearn
            features = importance_source.feature_names_in_
        elif hasattr(importance_source, 'feature_name_'):  # LightGBM
            features = importance_source.feature_name_
        elif hasattr(importance_source, 'feature_names_'):  # CatBoost
            features = importance_source.feature_names_
        else:
            features = X_train.columns.tolist()
    elif hasattr(importance_source, 'get_feature_importance'):
        importance = importance_source.get_feature_importance()
        features = importance_source.feature_names_
    else:
        # No importance information available: treat all features as equal.
        features = X_train.columns.tolist()
        importance = np.ones(len(features))

    # Normalise container types so the fancy indexing below works even when the
    # library hands back plain Python lists.
    importance = np.asarray(importance, dtype=float)
    features = list(features)

    # Dynamic threshold: keep the most important features until they cover
    # IMPORTANCE_COVERAGE of the total importance, but at least MIN_FEATURES.
    sorted_idx = np.argsort(importance)[::-1]
    cumulative = np.cumsum(importance[sorted_idx])
    threshold_idx = np.where(cumulative >= IMPORTANCE_COVERAGE * cumulative[-1])[0][0]
    n_keep = max(threshold_idx + 1, MIN_FEATURES)
    selected_features = [features[i] for i in sorted_idx[:n_keep]]
    print(f'{name} selected features:', selected_features)

    # Refit only when the selection actually dropped something.
    if len(selected_features) < len(features):
        print(f"{name} 特征筛选: 从 {len(features)} 个特征中保留 {len(selected_features)} 个")
        X_train_selected = X_train[selected_features]
        X_test_selected = X_test[selected_features]

        # Second fit on the reduced feature set; the winner may change.
        best_models, stacking, mape = train_models(X_train_selected, y_train, X_test_selected, y_test)
        best_model_name = min(mape, key=mape.get)
        best_model = stacking if best_model_name == 'stacking' else best_models[best_model_name]
        model_features = selected_features

    # Save the model with the exact feature list (and order) it was fitted on.
    # The original `'selected_features' in locals()` check was always true, so
    # without a refit it stored the importance-sorted order instead.
    filename = f"models/round7/{name}_best_{best_model_name}.pkl"
    joblib.dump({
        'model': best_model,
        'features': model_features,
        'name': name,
        'mape': mape[best_model_name]
    },
        filename)
    models.append(filename)

    # The stacking ensemble is always saved as well, whether or not it won.
    stacking_filename = f"models/round7/{name}_stacking.pkl"
    joblib.dump(stacking, stacking_filename)
    print(f"{name} 最佳模型已保存：{filename} (MAPE={mape[best_model_name]:.4f})")

Hong Kong International Airport selected features: ['lag_1', 'rolling_7_mean', 'is_hk_holiday', 'days_since_start', 'ma_ratio_7_30', 'weekday', 'lag_7', 'rolling_30_mean', 'post_cn_holiday', 'lag_21', 'airport_pc', 'ex_rate_lag_5', 'ex_rate_lag_7', 'lowu_pc', 'year', 'bay_sum', 'lag_4', 'lag_2', 'hk_weather_mobile_lag_7', 'month_sin', 'lowu_sum', 'days_squared', 'hk_map_pc_lag_7', 'lowu_mobile', 'rolling_7_std', 'bridge_pc', 'hk_show_mobile_lag_5', 'day', 'hk_weather_sum_lag_7', 'lag_5', 'hk_pc_lag_7', 'hk_map_sum_lag_14', 'hk_hotel_sum_lag_7', 'hk_weekend_holiday', 'hk_shopping_pc_lag_14', 'bay_mobile', 'post_hk_holiday', 'hk_food_pc_lag_5', 'pc_mobile_lag_14', 'bridge_mobile', 'hk_map_sum_lag_5', 'hk_show_pc_lag_5', 'hk_pc_lag_5', 'hk_weather_sum_lag_5', 'month', 'airport_mobile', 'rain', 'hk_weather_pc_lag_14', 'hk_food_sum_lag_7', 'bay_pc', 'total_change_7d', 'hk_shopping_sum_lag_5']
Hong Kong International Airport 特征筛选: 从 134 个特征中保留 52 个
Hong Kong International Airport 最佳模型已保存：mod

In [19]:
# Forecast 5 steps ahead for each portfolio using the models saved above.
# Pairing is positional: `models` was filled while iterating processed_data_dict,
# which was populated in the same loop (and therefore the same order) as df_dict.
for (name, df), model_path in zip(df_dict.items(), models):
    saved_data = joblib.load(model_path)
    model, required_features, mape = (
        saved_data[key] for key in ('model', 'features', 'mape')
    )
    forecast = forecast_future(df, model, 5, required_features)
    print(forecast['Total'], name, mape)

Date
2025-04-15    59866.717483
2025-04-16    58689.693194
2025-04-17    60073.406073
2025-04-18    67544.324198
2025-04-19    71528.703706
Name: Total, dtype: float64 Hong Kong International Airport 0.047374445032239
Date
2025-04-15    31363.481325
2025-04-16    29560.004283
2025-04-17    31908.755642
2025-04-18    49380.819498
2025-04-19    68646.891221
Name: Total, dtype: float64 Hong Kong-Zhuhai-Macao Bridge 0.07367136158537427
Date
2025-04-15     81096.948908
2025-04-16     81477.244247
2025-04-17     81124.391693
2025-04-18    109681.755307
2025-04-19    134969.488951
Name: Total, dtype: float64 Lo Wu 0.07285770778817846
Date
2025-04-15     63856.312069
2025-04-16     64704.526368
2025-04-17     64640.325405
2025-04-18    100526.221391
2025-04-19    101560.576060
Name: Total, dtype: float64 Lok Ma Chau Spur Line 0.07012339667752591
Date
2025-04-15    48160.448394
2025-04-16    47330.712695
2025-04-17    50640.493918
2025-04-18    71420.265087
2025-04-19    82567.195328
Name: Tota

In [21]:
# One step size per portfolio, matched positionally with data_dict's iteration order.
step_list = [1, 1, 1, 3, 1]
result_dict = {}  # collects the result for every portfolio that is processed
for (name, data), step in zip(data_dict.items(), step_list):
    # Only these two portfolios are handled in this cell; the rest are skipped.
    if name not in ('Hong Kong International Airport', 'Lo Wu'):
        continue
    result = process_portfolio(name, data, step)
    result_dict[name] = result
    print(f"{name} 模型训练完成")
    print(f'MAPE: {result["metrics"]}')

Hong Kong International Airport 模型训练完成
MAPE:         ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
1  0.040408  0.048045     0.0455               0.043696  0.040081  0.065324
Lo Wu 模型训练完成
MAPE:        ETS    SARIMA  AutoTBATS  DynamicOptimizedTheta      MSTL  Ensemble
1  0.08068  0.069972   0.068026               0.075337  0.075878  0.094735


In [22]:
total_horizon = 5  # total number of days to forecast

for name, res in result_dict.items():
    # Unpack what was stored for this portfolio by the process_portfolio cell.
    forecaster = res['forecaster']
    data = res['data']
    step = res['step']
    current_step = step  # reuse the step size that was used during validation
    # Lowest score in the metrics row for this step; printed next to the name
    # of the model the forecaster selected.
    metrics_min = res['metrics'].loc[current_step].min()

    # rolling_predict receives a copy with a constant 'unique_id' column
    # inserted in first position; the stored frame itself is left untouched.
    data_with_id = data.copy()
    data_with_id.insert(0, 'unique_id', 1)

    if forecaster.best_model_name == 'Ensemble':
        # Ensemble: forecast with every base model, stack the forecasts
        # column-wise and let the stored ensemble model combine them.
        ensemble_model = res['ensemble_model']
        models_preds = []
        for model in forecaster.models:
            model_pred = rolling_predict(data_with_id, model, days=total_horizon, horizon=current_step)
            print(model_pred)  # per-model forecast, shown for inspection
            models_preds.append(model_pred)
        X_stack = np.column_stack(models_preds)
        final_pred = ensemble_model.predict(X_stack)[:total_horizon]
    else:
        # Single best model: look it up by its position in model_names.
        best_model = forecaster.models[forecaster.model_names.index(forecaster.best_model_name)]
        final_pred = rolling_predict(data_with_id, best_model, days=total_horizon, horizon=current_step)

    # Report the horizon that was actually used rather than a hard-coded "5".
    print(f'{name}: 未来{total_horizon}天预测结果{final_pred} ')
    print(f'MAPE: {metrics_min} MODEL: {forecaster.best_model_name}')

Hong Kong International Airport: 未来5天预测结果[55670.273, 54130.62293147412, 56657.149074260364, 62817.066976302034, 60271.948274615854] 
MAPE: 0.0400809193418922 MODEL: MSTL
Lo Wu: 未来5天预测结果[78187.88, 80333.5739353477, 80148.13705146244, 81973.45547264464, 98345.2434523564] 
MAPE: 0.06802588430365338 MODEL: AutoTBATS
