In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
import lightgbm as lgb
import shap
import warnings
import matplotlib.pyplot as plt
from datetime import timedelta

# 忽略警告
warnings.filterwarnings('ignore')

# 设置 matplotlib 字体以显示中文
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# ---------------------------
# 模型预测部分
# ---------------------------

class LightGBMModel:
    """
    Stateless helpers for explaining and tuning LightGBM models.

    All public methods are static and share one error contract: any exception
    raised while fitting/explaining/searching is printed to stdout and a tuple
    of three ``None`` values is returned instead of propagating. Callers must
    therefore check the first element for ``None``.
    """

    @staticmethod
    def _mean_abs_shap(shap_values):
        """
        Reduce SHAP output to one mean absolute value per feature.

        :param shap_values: a 2-D array ``(n_samples, n_features)``; or, for
            classifiers, a list with one such array per class (some shap
            versions) or a 3-D array ``(n_samples, n_features, n_classes)``
            (other shap versions)
        :return: 1-D array with one importance score per feature
        """
        came_as_list = isinstance(shap_values, list)
        magnitudes = np.abs(np.asarray(shap_values))
        if magnitudes.ndim == 3:
            if came_as_list:
                # Stacked list -> (n_classes, n_samples, n_features).
                return magnitudes.mean(axis=(0, 1))
            # Native 3-D output -> (n_samples, n_features, n_classes).
            return magnitudes.mean(axis=(0, 2))
        return magnitudes.mean(axis=0)

    @staticmethod
    def _estimator_and_scoring(isClassifier):
        """
        Return a default-constructed LightGBM estimator and the scoring name
        used to rank candidates during hyper-parameter search.
        """
        if isClassifier:
            return lgb.LGBMClassifier(), 'accuracy'
        return lgb.LGBMRegressor(), 'neg_mean_absolute_error'

    @staticmethod
    def _fit_search(search, train_X, train_Y):
        """
        Fit a scikit-learn search object and unpack the pieces callers need.

        :return: (best estimator, best parameter dict, cv results DataFrame)
        """
        search.fit(train_X, train_Y)
        return search.best_estimator_, search.best_params_, pd.DataFrame(search.cv_results_)

    @staticmethod
    def shap_feature_importance(train_X, train_Y, params, isClassifier=True):
        """
        Fit a LightGBM model and rank features by mean absolute SHAP value.

        :param train_X: training features (DataFrame; its columns name the features)
        :param train_Y: training labels
        :param params: keyword arguments forwarded to the LightGBM estimator
        :param isClassifier: fit an ``LGBMClassifier`` if True, else an ``LGBMRegressor``
        :return: (feature importance DataFrame sorted descending with columns
            'Feature' and 'Importance', raw SHAP values as returned by the
            explainer, fitted model); ``(None, None, None)`` on any error
        """
        try:
            if isClassifier:
                model = lgb.LGBMClassifier(**params)
            else:
                model = lgb.LGBMRegressor(**params)

            model.fit(train_X, train_Y)
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(train_X)

            # Per-class SHAP output must be collapsed over samples AND classes;
            # a plain mean over axis 0 would not yield one value per feature.
            mean_abs_shap = LightGBMModel._mean_abs_shap(shap_values)
            feature_importance_df = pd.DataFrame({'Feature': train_X.columns, 'Importance': mean_abs_shap})
            feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

            return feature_importance_df, shap_values, model
        except Exception as e:
            print('shap_feature_importance has error: ' + str(e))
            return None, None, None

    @staticmethod
    def grid_tune(train_X, train_Y, fold_time, param_grid, isClassifier=True):
        """
        Tune hyper-parameters with an exhaustive grid search.

        Cross-validation uses ``TimeSeriesSplit`` so that every validation fold
        comes after its training fold in row order.

        :param train_X: training features
        :param train_Y: training labels
        :param fold_time: number of time-series splits
        :param param_grid: parameter grid passed to ``GridSearchCV``
        :param isClassifier: tune a classifier (scored by accuracy) if True,
            else a regressor (scored by negative MAE)
        :return: (best model, best parameters, cv results DataFrame);
            ``(None, None, None)`` on any error
        """
        try:
            model, scoring = LightGBMModel._estimator_and_scoring(isClassifier)
            grid_search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                cv=TimeSeriesSplit(n_splits=fold_time),
                n_jobs=-1,
                scoring=scoring,
                verbose=1
            )
            return LightGBMModel._fit_search(grid_search, train_X, train_Y)
        except Exception as e:
            print('grid_tune has error: ' + str(e))
            return None, None, None

    @staticmethod
    def random_tune(train_X, train_Y, fold_time, param_distributions, isClassifier=True, n_iter=10):
        """
        Tune hyper-parameters with a randomized search (fixed seed 42).

        :param train_X: training features
        :param train_Y: training labels
        :param fold_time: number of time-series splits
        :param param_distributions: parameter distributions passed to ``RandomizedSearchCV``
        :param isClassifier: tune a classifier (scored by accuracy) if True,
            else a regressor (scored by negative MAE)
        :param n_iter: number of sampled parameter settings
        :return: (best model, best parameters, cv results DataFrame);
            ``(None, None, None)`` on any error
        """
        try:
            model, scoring = LightGBMModel._estimator_and_scoring(isClassifier)
            random_search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_distributions,
                cv=TimeSeriesSplit(n_splits=fold_time),
                n_jobs=-1,
                scoring=scoring,
                n_iter=n_iter,
                verbose=1,
                random_state=42
            )
            return LightGBMModel._fit_search(random_search, train_X, train_Y)
        except Exception as e:
            print('random_tune has error: ' + str(e))
            return None, None, None

# Load the merged dataset, parse 'Date' into datetimes, and order the rows by
# ticker and then by date so each ticker's history is contiguous and chronological.
df = (
    pd.read_csv('完整的合併資料.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(['ticker', 'Date'])
    .reset_index(drop=True)
)

# Column to predict; replace with the name of your own target column if needed.
target = 'Adj_Close'

# Every column other than the two identifiers and the target is a model input.
_non_feature_columns = ('Date', 'ticker', target)
features = [col for col in df.columns if col not in _non_feature_columns]

def _score_regressor(model, X_train, y_train, X_test, y_test):
    """Return (test predictions, train MAE, train MAPE %, test MAE, test MAPE %) for a fitted model."""
    test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_mape = mean_absolute_percentage_error(y_test, test_pred) * 100  # as a percentage
    train_pred = model.predict(X_train)  # predicted once and reused for both train metrics
    train_mae = mean_absolute_error(y_train, train_pred)
    train_mape = mean_absolute_percentage_error(y_train, train_pred) * 100  # as a percentage
    return test_pred, train_mae, train_mape, test_mae, test_mape


def _select_features_by_shap(feature_importance_df, shap_threshold):
    """Pick the leading features whose cumulative importance share stays within the threshold."""
    total_importance = feature_importance_df['Importance'].sum()
    feature_importance_df['Cumulative'] = feature_importance_df['Importance'].cumsum() / total_importance
    if not total_importance > 0:
        # Zero (or NaN) total importance gives no usable ranking: keep everything.
        return feature_importance_df['Feature'].tolist()
    within = feature_importance_df[feature_importance_df['Cumulative'] <= shap_threshold]['Feature'].tolist()
    if not within and len(feature_importance_df) > 0:
        # The top feature alone exceeds the threshold; an empty feature set
        # would make every later fit fail, so keep that single feature.
        within = [feature_importance_df['Feature'].iloc[0]]
    return within


def _evaluate_tuned_model(ticker, label, tuned_model, original_model, original_scores,
                          X_train, y_train, X_test, y_test):
    """Score a tuned model, falling back to the original model's scores when tuning produced nothing."""
    if tuned_model is None:
        print(f"Ticker {ticker} - {label} 未找到最佳模型，使用原始模型的预测结果。")
        return original_model, original_scores
    try:
        scores = _score_regressor(tuned_model, X_train, y_train, X_test, y_test)
    except Exception as e:
        print(f"Ticker {ticker} - {label} 预测失败: {str(e)}")
        return tuned_model, (None, None, None, None, None)
    _, train_mae, train_mape, test_mae, test_mape = scores
    print(f"Ticker {ticker} - {label} 训练集 MAE: {train_mae}, MAPE: {train_mape}%")
    print(f"Ticker {ticker} - {label} 测试集 MAE: {test_mae}, MAPE: {test_mape}%")
    return tuned_model, scores


def _nan_if_none(value):
    """Map None to NaN so metric columns stay numeric."""
    return value if value is not None else np.nan


def train_and_predict_single_ticker(group_df, shap_threshold=0.85, use_feature_selection=True):
    """
    Train, tune and evaluate LightGBM regressors for a single ticker.

    The rows are sorted by 'Date'; the first 80 % form the training set and
    the remaining 20 % the test set. Three candidates are scored on the test
    set (a fixed-parameter model, a grid-search model and a random-search
    model) and the one with the lowest test MAE provides the final predictions.
    Reads the module-level ``features`` and ``target`` names.

    :param group_df: rows of one ticker; must contain 'Date', 'ticker',
        the target column and every feature column
    :param shap_threshold: cumulative SHAP-importance share used to pick
        features when ``use_feature_selection`` is True
    :param use_feature_selection: keep only the top SHAP features if True,
        otherwise train on all features
    :return: dict with metrics, fitted models, predictions and SHAP data, or
        ``None`` when there are fewer than 20 rows, the baseline model cannot
        be trained, or no candidate has a valid MAE
    """
    # Guarantee chronological order before splitting.
    group_df = group_df.sort_values('Date').reset_index(drop=True)
    # An empty frame has no ticker label; avoid indexing into it.
    ticker = group_df['ticker'].iloc[0] if len(group_df) else None

    # Too few rows to train anything meaningful.
    if len(group_df) < 20:
        print(f"Ticker {ticker} 数据量不足，跳过")
        return None

    # Chronological 80/20 split (no shuffling).
    split_index = int(len(group_df) * 0.8)
    train = group_df.iloc[:split_index]
    test = group_df.iloc[split_index:]

    if len(test) == 0:
        print(f"Ticker {ticker} 测试集为空，跳过")
        return None

    X_train = train[features]
    y_train = train[target]
    X_test = test[features]
    y_test = test[target]

    # Parameters of the baseline ("original") model.
    params = {
        'boosting_type': 'dart',
        'n_estimators': 100,
        'learning_rate': 0.05,
        'n_jobs': -1,
        'random_state': 7,
        'verbose': 0,
        'min_data_in_leaf': 5
    }

    # SHAP values are computed whether or not feature selection is enabled,
    # because they are also returned for plotting.
    try:
        feature_importance_df, shap_values, _ = LightGBMModel.shap_feature_importance(
            X_train, y_train, params=params, isClassifier=False)
        if feature_importance_df is not None:
            print(f"Ticker {ticker} - SHAP 计算成功")
        else:
            print(f"Ticker {ticker} - SHAP 计算失败")
    except Exception as e:
        print(f"Ticker {ticker} - SHAP 计算异常: {str(e)}")
        feature_importance_df, shap_values = None, None

    # Default: all features; overwritten only when selection fully succeeds.
    selected_features = features.copy()
    shap_values_selected = shap_values
    if use_feature_selection and feature_importance_df is not None:
        try:
            important_features = _select_features_by_shap(feature_importance_df, shap_threshold)
            print(f"Ticker {ticker} - 选择的特征数量：{len(important_features)}")
            print(f"Ticker {ticker} - 选择的特征：{important_features}")
            # Restrict the SHAP matrix to the same columns, in the same order.
            selected_feature_indices = [X_train.columns.get_loc(f) for f in important_features]
            shap_subset = shap_values[:, selected_feature_indices]
        except Exception as e:
            print(f"Ticker {ticker} - 特征筛选失败: {str(e)}")
        else:
            selected_features = important_features
            shap_values_selected = shap_subset
    elif not use_feature_selection:
        print(f"Ticker {ticker} - 未进行特征选择，使用所有特征")

    X_train_important = X_train[selected_features]
    X_test_important = X_test[selected_features]

    # 1. Baseline model; without it there is nothing to fall back on.
    try:
        original_model = lgb.LGBMRegressor(**params)
        original_model.fit(X_train_important, y_train)
        original_scores = _score_regressor(original_model, X_train_important, y_train, X_test_important, y_test)
        y_pred_original, mae_original_train, mape_original_train, mae_original, mape_original = original_scores
        print(f"Ticker {ticker} - 原始模型训练集 MAE: {mae_original_train}, MAPE: {mape_original_train}%")
        print(f"Ticker {ticker} - 原始模型测试集 MAE: {mae_original}, MAPE: {mape_original}%")
    except Exception as e:
        print(f"Ticker {ticker} - 原始模型训练失败: {str(e)}")
        return None

    # 2. Hyper-parameter tuning: grid search.
    param_grid = {
        'learning_rate': [0.005, 0.01, 0.03, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9, -1],
        'num_leaves': [31, 63],
        'n_estimators': [100, 200],
        'boosting_type': ['gbdt', 'dart'],
        'random_state': [7],
    }

    best_model_grid, best_params_grid, cv_results_grid = LightGBMModel.grid_tune(
        X_train_important, y_train, fold_time=5, param_grid=param_grid, isClassifier=False)
    best_model_grid, grid_scores = _evaluate_tuned_model(
        ticker, 'Grid Search', best_model_grid, original_model, original_scores,
        X_train_important, y_train, X_test_important, y_test)
    y_pred_grid, mae_grid_train, mape_grid_train, mae_grid, mape_grid = grid_scores

    # 3. Hyper-parameter tuning: random search.
    param_dist = {
        'learning_rate': [0.005, 0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9, -1],
        'num_leaves': [15, 31, 63, 127],
        'n_estimators': [50, 100, 200, 500],
        'min_child_samples': [5, 10, 20, 50],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'lambda_l1': [0, 0.01, 0.1, 1],
        'lambda_l2': [0, 0.01, 0.1, 1],
        'boosting_type': ['gbdt', 'dart'],
        'random_state': [7],
    }

    best_model_random, best_params_random, cv_results_random = LightGBMModel.random_tune(
        X_train_important, y_train, fold_time=5, param_distributions=param_dist, isClassifier=False, n_iter=100)
    best_model_random, random_scores = _evaluate_tuned_model(
        ticker, 'Random Search', best_model_random, original_model, original_scores,
        X_train_important, y_train, X_test_important, y_test)
    y_pred_random, mae_random_train, mape_random_train, mae_random, mape_random = random_scores

    # Pick the candidate with the lowest test MAE (ties go to the earliest entry).
    # NOTE(review): selecting on the test set makes the reported best MAE optimistic.
    candidates = {
        'original': (mae_original, y_pred_original),
        'grid_search': (mae_grid, y_pred_grid),
        'random_search': (mae_random, y_pred_random),
    }
    mae_values = {kind: mae for kind, (mae, _) in candidates.items() if mae is not None}

    if not mae_values:
        print(f"Ticker {ticker} - 没有有效的 MAE 值，跳过模型选择")
        return None

    best_model_type = min(mae_values, key=mae_values.get)
    best_mae = mae_values[best_model_type]
    print(f"Ticker {ticker} - 选择的最佳模型: {best_model_type}，测试集 MAE: {best_mae}")
    final_predicted = candidates[best_model_type][1]

    result_dict = {
        'ticker': ticker,
        'selected_features': selected_features,
        'original_train_mae': mae_original_train,
        'original_train_mape': mape_original_train,
        'original_test_mae': mae_original,
        'original_test_mape': mape_original,
        'grid_train_mae': _nan_if_none(mae_grid_train),
        'grid_train_mape': _nan_if_none(mape_grid_train),
        'grid_test_mae': _nan_if_none(mae_grid),
        'grid_test_mape': _nan_if_none(mape_grid),
        'random_train_mae': _nan_if_none(mae_random_train),
        'random_train_mape': _nan_if_none(mape_random_train),
        'random_test_mae': _nan_if_none(mae_random),
        'random_test_mape': _nan_if_none(mape_random),
        'best_model_type': best_model_type,
        'best_mae': best_mae,
        'original_model': original_model,
        'best_model_grid': best_model_grid,
        'best_model_random': best_model_random,
        'y_test': y_test,
        'y_pred_original': y_pred_original,
        # Predictions are None whenever the matching MAE is None.
        'y_pred_grid': y_pred_grid if mae_grid is not None else None,
        'y_pred_random': y_pred_random if mae_random is not None else None,
        'test_dates': test['Date'].values,
        'shap_values': shap_values,
        'shap_feature_names': X_train_important.columns.tolist(),
        'X_train_important': X_train_important,
        'best_params_grid': best_params_grid,
        'best_params_random': best_params_random,
        'cv_results_grid': cv_results_grid,
        'cv_results_random': cv_results_random,
    }

    # Only expose the SHAP matrix for plotting when its row count matches the
    # training matrix it will be plotted against.
    shap_for_plot = None
    if shap_values_selected is not None:
        try:
            print(f"Ticker {ticker} - shap_values_selected.shape: {shap_values_selected.shape}")
            print(f"Ticker {ticker} - X_train_important.shape: {X_train_important.shape}")
            if shap_values_selected.shape[0] != X_train_important.shape[0]:
                print(f"Ticker {ticker} - shap_values_selected 样本数与 X_train_important 不匹配")
            else:
                shap_for_plot = shap_values_selected
        except Exception as e:
            print(f"Ticker {ticker} - 筛选 shap_values 失败: {str(e)}")
            shap_for_plot = None
    result_dict['shap_values_selected'] = shap_for_plot

    # Despite their names, 'Actual_Return' / 'Predicted_Return' hold actual and
    # predicted values of the target column; the names are kept because the
    # saved CSV is read back under these column names later in the notebook.
    predictions = pd.DataFrame({
        'Date': test['Date'].values,
        'ticker': ticker,
        'Actual_Return': y_test.values,
        'Predicted_Return': final_predicted,
        'Best_Model': best_model_type
    })
    result_dict['predictions'] = predictions

    return result_dict

# One result dictionary is collected per ticker that could be modelled.
results = []

# Process the dataset one ticker at a time.
grouped = df.groupby('ticker')

for name, group in grouped:
    print(f"\n正在处理 Ticker: {name}")
    result = train_and_predict_single_ticker(
        group,
        shap_threshold=0.85,           # cumulative SHAP share; adjust as needed
        use_feature_selection=False    # True: select features by SHAP; False: use all features
    )
    if result is None:
        continue
    results.append(result)

# Merge the per-ticker prediction tables and persist them.
if not results:
    print("没有任何预测结果被生成。")
else:
    all_predictions = pd.concat([res['predictions'] for res in results], ignore_index=True)

    # Make sure the date column is a proper datetime before saving.
    all_predictions['Date'] = pd.to_datetime(all_predictions['Date'])

    # Save the predictions as CSV.
    all_predictions.to_csv('predictions.csv', index=False)
    print("\n所有预测结果已保存为 'predictions.csv'")

# SHAP plots and prediction diagnostics per ticker.
shap.initjs()

for res in results:
    ticker = res['ticker']
    shap_values_selected = res['shap_values_selected']  # SHAP matrix restricted to the selected features
    feature_names = res['shap_feature_names']
    X_train_important = res['X_train_important']

    print(f"\n绘制 Ticker {ticker} 的 SHAP 图")

    # Report the shapes of the SHAP matrix and of the data it explains.
    if shap_values_selected is not None:
        print(f"Ticker {ticker} - shap_values_selected.shape: {shap_values_selected.shape}")
        print(f"Ticker {ticker} - X_train_important.shape: {X_train_important.shape}")

    if shap_values_selected is None or X_train_important.shape[0] <= 0:
        print(f"Ticker {ticker} 的 SHAP 值计算失败，无法绘制 SHAP 图")
    else:
        try:
            # Summary plot.
            summary_path = f'shap_summary_{ticker}.png'
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values_selected, X_train_important, feature_names=feature_names, show=False)
            plt.title(f"Ticker {ticker} - SHAP Summary Plot")
            plt.savefig(summary_path, dpi=300)
            plt.close()
            print(f"保存 Ticker {ticker} 的 SHAP Summary Plot 为 '{summary_path}'")

            # Bar plot of mean absolute SHAP values.
            bar_path = f'shap_bar_{ticker}.png'
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values_selected, X_train_important, feature_names=feature_names, plot_type="bar", show=False)
            plt.title(f"Ticker {ticker} - SHAP Feature Importance")
            plt.savefig(bar_path, dpi=300)
            plt.close()
            print(f"保存 Ticker {ticker} 的 SHAP Feature Importance 图为 '{bar_path}'")
        except Exception as e:
            print(f"Ticker {ticker} 的 SHAP 绘图失败: {str(e)}")

    predictions = res['predictions']
    has_predictions = predictions is not None and not predictions.empty
    if not has_predictions:
        print(f"Ticker {ticker} 没有足够的数据绘制 Actual vs Predicted Returns 图")
    else:
        # Actual versus predicted values over the test period.
        comparison_path = f'actual_vs_predicted_{ticker}.png'
        plt.figure(figsize=(12, 6))
        plt.plot(predictions['Date'], predictions['Actual_Return'], label='Actual Return', marker='o')
        plt.plot(predictions['Date'], predictions['Predicted_Return'], label='Predicted Return', marker='x')
        plt.xlabel('Date')
        plt.ylabel('Return')
        plt.title(f'Ticker {ticker} - Actual vs Predicted Returns')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(comparison_path, dpi=300)
        plt.close()
        print(f"保存 Ticker {ticker} 的 Actual vs Predicted Returns 图为 '{comparison_path}'")

        # Histogram of the prediction errors (actual minus predicted).
        error_path = f'prediction_error_distribution_{ticker}.png'
        plt.figure(figsize=(10, 6))
        error = predictions['Actual_Return'] - predictions['Predicted_Return']
        plt.hist(error, bins=30, edgecolor='k', alpha=0.7)
        plt.xlabel('Prediction Error (Actual - Predicted)')
        plt.ylabel('Frequency')
        plt.title(f'Ticker {ticker} - Prediction Error Distribution')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(error_path, dpi=300)
        plt.close()
        print(f"保存 Ticker {ticker} 的 Prediction Error Distribution 图为 '{error_path}'")

# Print MAE, MAPE and the winning model type for every ticker.
for res in results:
    ticker = res['ticker']
    print(f"\n=== {ticker} 的模型性能 ===")
    for model_label, key_prefix in (
        ("原始模型", 'original'),
        ("Grid Search 调优模型", 'grid'),
        ("Random Search 调优模型", 'random'),
    ):
        for split_label, split_key in (("训练集", 'train'), ("测试集", 'test')):
            mae_key = key_prefix + '_' + split_key + '_mae'
            mape_key = key_prefix + '_' + split_key + '_mape'
            print(f"{model_label}{split_label} MAE: {res[mae_key]}, MAPE: {res[mape_key]}%")
    print(f"最佳模型类型: {res['best_model_type']}")
    print(f"最佳模型测试集 MAE: {res['best_mae']}")



預測部分

In [None]:
# New Jupyter cell.
#
# Assumes the cell above has been run, so `results`, `df`, `lgb`, `np`, `pd`
# and `plt` are already defined.
#
# For every ticker, predict the test period one step at a time, refitting the
# model on all data available before each predicted row.
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Per-ticker step-by-step prediction tables.
all_results = []

# Per-ticker average MAE / MAPE records.
mae_mape_list = []

for res in results:
    ticker = res['ticker']
    print(f"\n开始对 Ticker {ticker} 进行逐步预测（每次重新训练模型）")

    # Chronologically ordered rows of this ticker.
    group_df = df[df['ticker'] == ticker].sort_values('Date').reset_index(drop=True)

    # Target and features; the target must match the one used in the cell above.
    target = 'Adj_Close'
    features = res['selected_features']  # features chosen in the earlier run

    # Accumulators for the step-by-step results.
    dates = []
    actuals = []
    predictions = []
    train_maes = []
    train_mapes = []
    test_maes = []
    test_mapes = []

    # The first 80 % of the rows form the initial training window.
    initial_train_size = int(len(group_df) * 0.8)
    train_df = group_df.iloc[:initial_train_size]
    test_df = group_df.iloc[initial_train_size:].reset_index(drop=True)

    print(f"Ticker {ticker} 的初始训练集大小: {len(train_df)}, 测试集大小: {len(test_df)}")

    # Reuse the parameters of whichever model won in the earlier run.
    best_model_type = res['best_model_type']
    tuned_params = {
        'grid_search': res['best_params_grid'],
        'random_search': res['best_params_random'],
    }
    if best_model_type == 'original':
        params = res['original_model'].get_params()
    elif tuned_params.get(best_model_type) is not None:
        params = tuned_params[best_model_type]
    else:
        # No tuned parameters available: fall back to the baseline settings.
        params = {
            'boosting_type': 'dart',
            'n_estimators': 100,
            'learning_rate': 0.05,
            'n_jobs': -1,
            'random_state': 7,
            'verbose': 0,
            'min_data_in_leaf': 5
        }

    # Walk forward through the test period.
    for i, current_date in enumerate(test_df['Date']):
        # Training window: the initial window plus every test row already passed.
        current_train_df = group_df.iloc[:initial_train_size + i]
        X_train = current_train_df[features]
        y_train = current_train_df[target]

        # The single row to predict (double brackets keep it 2-D / a Series).
        X_test = test_df[features].iloc[[i]]
        y_test = test_df[target].iloc[[i]]

        # Refit from scratch on the current window.
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # In-sample error on the current training window.
        y_train_pred = model.predict(X_train)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred) * 100  # as a percentage

        # Error on the single predicted row.
        test_mae = mean_absolute_error(y_test, y_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_pred) * 100  # as a percentage

        dates.append(current_date)
        actuals.append(y_test.values[0])
        predictions.append(y_pred[0])  # y_pred is a one-element array
        train_maes.append(train_mae)
        train_mapes.append(train_mape)
        test_maes.append(test_mae)
        test_mapes.append(test_mape)

        # Progress line for this step.
        print(f"日期: {current_date.date()}, 实际值: {y_test.values[0]}, 预测值: {y_pred[0]}, 训练 MAE: {train_mae:.4f}, 训练 MAPE: {train_mape:.2f}%, 测试 MAE: {test_mae:.4f}, 测试 MAPE: {test_mape:.2f}%")

    # Tabulate this ticker's steps, tagging every row with the ticker.
    results_df = pd.DataFrame({
        'Date': dates,
        'Actual': actuals,
        'Predicted': predictions,
        'Train_MAE': train_maes,
        'Train_MAPE': train_mapes,
        'Test_MAE': test_maes,
        'Test_MAPE': test_mapes
    }).assign(Ticker=ticker)

    all_results.append(results_df)

    # Averages over all steps of this ticker.
    avg_train_mae = np.mean(train_maes)
    avg_train_mape = np.mean(train_mapes)
    avg_test_mae = np.mean(test_maes)
    avg_test_mape = np.mean(test_mapes)

    mae_mape_list.append({
        'Ticker': ticker,
        'Avg_Train_MAE': avg_train_mae,
        'Avg_Train_MAPE': avg_train_mape,
        'Avg_Test_MAE': avg_test_mae,
        'Avg_Test_MAPE': avg_test_mape
    })

    # Three line charts share one layout: actual vs predicted, MAE over time,
    # and MAPE over time. Each entry is
    # (first series, second series, y label, title suffix, file stem, description).
    figure_specs = (
        (('Actual', 'Actual'), ('Predicted', 'Predicted'), target,
         '逐步预测结果（每次重新训练模型）', 'step_by_step_prediction', '逐步预测图'),
        (('Train_MAE', 'Train MAE'), ('Test_MAE', 'Test MAE'), 'MAE',
         'MAE 随时间的变化', 'mae_over_time', ' MAE 变化图'),
        (('Train_MAPE', 'Train MAPE'), ('Test_MAPE', 'Test MAPE'), 'MAPE (%)',
         'MAPE 随时间的变化', 'mape_over_time', ' MAPE 变化图'),
    )
    for first_series, second_series, y_label, title_suffix, file_stem, description in figure_specs:
        figure_path = f'{file_stem}_{ticker}.png'
        plt.figure(figsize=(12, 6))
        plt.plot(results_df['Date'], results_df[first_series[0]], label=first_series[1], marker='o')
        plt.plot(results_df['Date'], results_df[second_series[0]], label=second_series[1], marker='x')
        plt.xlabel('Date')
        plt.ylabel(y_label)
        plt.title(f'Ticker {ticker} - {title_suffix}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(figure_path, dpi=300)
        plt.show()
        print(f"保存 Ticker {ticker} 的{description}为 '{figure_path}'")

# After the loop: merge and save every ticker's step-by-step predictions.
if not all_results:
    print("没有任何预测结果生成。")
else:
    consolidated_results = pd.concat(all_results, ignore_index=True)
    consolidated_results.to_csv('consolidated_predictions.csv', index=False)
    print("所有 ticker 的预测结果已保存到 'consolidated_predictions.csv'")

# Save the per-ticker average MAE / MAPE table.
mae_mape_df = pd.DataFrame(mae_mape_list)
mae_mape_df.to_csv('ticker_mae_mape.csv', index=False)
print("所有 ticker 的平均 MAE 和 MAPE 已保存到 'ticker_mae_mape.csv'")


投組部分

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import empyrical as ep
from pypfopt import EfficientFrontier, expected_returns, risk_models

# 1. Load the stock price data.
df_prices = pd.read_csv('股價.csv')

# Parse 'Date' into datetimes.
df_prices['Date'] = pd.to_datetime(df_prices['Date'])

# Pivot to a wide table: one row per date, one column per ticker, Adj_Close values.
df_prices = df_prices.pivot(index='Date', columns='ticker', values='Adj_Close')

# Cleaning: drop dates with no prices at all, strip thousands separators and
# coerce to numbers, and treat non-positive prices as missing.
df_prices.dropna(how='all', inplace=True)
df_prices = df_prices.apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',', ''), errors='coerce'))
df_prices[df_prices <= 0] = np.nan

# Fill gaps forward, then backward for leading gaps. `.ffill()` / `.bfill()`
# replace `fillna(method=...)`, which is deprecated in recent pandas releases.
# NOTE(review): the backward fill copies later prices into earlier dates
# (look-ahead) for tickers whose history starts late — confirm this is intended.
df_prices = df_prices.ffill().bfill()

# Ticker column labels as strings, so they match the prediction table below.
df_prices.columns = df_prices.columns.astype(str)

# 2. Load the predictions written by the modelling cell.
all_predictions = pd.read_csv('predictions.csv')

# Normalise dtypes.
all_predictions['Date'] = pd.to_datetime(all_predictions['Date'])
all_predictions['ticker'] = all_predictions['ticker'].astype(str)
all_predictions['Predicted_Return'] = pd.to_numeric(all_predictions['Predicted_Return'], errors='coerce')

# Order by ticker, then date.
all_predictions.sort_values(by=['ticker', 'Date'], inplace=True)

# Drop rows whose prediction could not be parsed.
all_predictions.dropna(subset=['Predicted_Return'], inplace=True)

# Simulation window: from the earliest prediction to the last available price.
start_date = all_predictions['Date'].min()
end_date = df_prices.index.max()

# Keep only prices from the first prediction date onwards.
df_prices = df_prices.loc[start_date:]

# Weekly aggregation: 'Week_Start' is the Monday of each prediction's week;
# predictions are averaged per (week, ticker).
all_predictions['Week_Start'] = all_predictions['Date'] - pd.to_timedelta(all_predictions['Date'].dt.weekday, unit='d')
weekly_predictions = all_predictions.groupby(['Week_Start', 'ticker'])['Predicted_Return'].mean().reset_index()

# Initial capital of the portfolio.
initial_portfolio_value = 100000000
cash = initial_portfolio_value

# Current holdings and the daily portfolio value record.
current_allocation = {}
portfolio_value_per_day = pd.DataFrame(index=df_prices.index)
portfolio_value_per_day['Total'] = np.nan

# Log of weights and trades for every rebalance.
trade_history = []

# Mondays between start_date and end_date.
# NOTE(review): if start_date is not a Monday, the 'Week_Start' of the first
# prediction week lies before start_date and is not in this list — verify
# against the code that consumes `weeks`.
weeks = pd.date_range(start=start_date, end=end_date, freq='W-MON')

# 定义辅助函数
def get_next_trading_day(target_date, price_index):
    """
    Resolve a calendar date to a date that exists in the price index.

    :param target_date: the date to look up
    :param price_index: index of available price dates
    :return: ``target_date`` itself when it is in ``price_index``; otherwise
        the smallest index entry strictly greater than ``target_date``;
        ``None`` when no such entry exists
    """
    if target_date in price_index:
        return target_date

    # Every index entry strictly after the requested date.
    later_dates = price_index[price_index > target_date]
    return None if later_dates.empty else later_dates.min()

def get_top_20_stocks_weekly(predictions_df, date, top_n=20):
    """
    Select the highest-ranked tickers for one week.

    :param predictions_df: table with 'Week_Start', 'ticker' and
        'Predicted_Return' columns; it is not modified
    :param date: the 'Week_Start' value identifying the week
    :param top_n: how many tickers to keep (default 20)
    :return: (list of ticker strings ordered by descending 'Predicted_Return',
        DataFrame with the corresponding rows); when the week has no rows the
        list is empty and the DataFrame is the empty selection
    """
    # Rows belonging to the requested week.
    predictions_on_week = predictions_df[predictions_df['Week_Start'] == date]

    # Nothing predicted for this week.
    if predictions_on_week.empty:
        return [], predictions_on_week

    # Rank by predicted value and keep the best `top_n`. `assign` builds a new
    # frame, so the ticker cast never writes into a slice of the caller's data.
    top_rows = (
        predictions_on_week
        .sort_values(by='Predicted_Return', ascending=False)
        .head(top_n)
        .assign(ticker=lambda frame: frame['ticker'].astype(str))
    )

    return top_rows['ticker'].tolist(), top_rows

def adjust_portfolio(top_stocks, valid_prices, current_allocation, cash, predictions_on_date, rebalance_date):
    """
    Rebalance the portfolio towards target weights for ``top_stocks``.

    Weights come from a max-Sharpe optimisation (PyPortfolioOpt) over the price
    history in the module-level ``df_prices`` up to ``rebalance_date``. Equal
    weights are used when fewer than two complete history rows exist or when the
    optimiser raises. Sales are executed first (0.4425% fee), purchases second
    (0.1425% fee). One record per call is appended to the module-level
    ``trade_history`` list.

    :param top_stocks: tickers selected for this rebalance
    :param valid_prices: mapping ticker -> price used for trading on the rebalance day
    :param current_allocation: mapping ticker -> shares held; modified in place
    :param cash: cash available before trading
    :param predictions_on_date: not used by this function; kept so existing callers keep working
    :param rebalance_date: label of the ``df_prices`` row used to price holdings
                           that are absent from ``valid_prices``
    :return: ``(current_allocation, cash)`` after trading
    """
    sell_fee_rate = 0.004425  # 0.4425% charged on sale proceeds
    buy_fee_rate = 0.001425   # 0.1425% charged on purchase cost

    def _is_usable_price(value):
        # numpy scalars are accepted too: np.int64 is not an instance of int.
        # Non-positive prices are rejected because they would be divided by below.
        return isinstance(value, (int, float, np.number)) and pd.notna(value) and value > 0

    # Complete price history of the selected stocks up to the rebalance date.
    historical_data = df_prices[top_stocks].loc[:rebalance_date].dropna()
    equal_weights = {stock: 1 / len(top_stocks) for stock in top_stocks}

    if historical_data.shape[0] < 2:
        # Not enough history to estimate returns and covariance.
        weights = equal_weights
    else:
        mu = expected_returns.mean_historical_return(historical_data)
        S = risk_models.CovarianceShrinkage(historical_data).ledoit_wolf()

        try:
            # Long-only efficient frontier; take the maximum-Sharpe weights.
            ef = EfficientFrontier(mu, S, weight_bounds=(0, 1))
            ef.max_sharpe()
            weights = ef.clean_weights()
        except Exception as e:
            print(f"优化时出错，使用等权重：{e}")
            weights = equal_weights

    # Weight keys must be strings to match the ticker strings used elsewhere.
    weights = {str(k): v for k, v in weights.items()}

    # Record of this rebalance.
    trade_info = {
        'Date': rebalance_date,
        'Buy': [],
        'Sell': [],
        'Weights': weights
    }

    # Keep only prices that can actually be traded at.
    valid_prices = {k: float(v) for k, v in valid_prices.items() if _is_usable_price(v)}

    def _price_on_rebalance(stock):
        # Price of a holding on the rebalance day, or None when unavailable.
        if stock in valid_prices:
            return valid_prices[stock]
        fallback = df_prices.loc[rebalance_date, stock]
        return float(fallback) if _is_usable_price(fallback) else None

    # Value every current holding, including positions that are about to be sold.
    # valid_prices only covers the newly selected stocks, so pricing outgoing
    # positions from it alone would count them as worth nothing and shrink all
    # targets. Holdings without a usable price contribute 0.
    holdings_value = 0.0
    for stock, shares in current_allocation.items():
        mark = _price_on_rebalance(stock)
        if mark is not None:
            holdings_value += shares * mark
    total_portfolio_value = holdings_value + cash

    # Target position sizes in currency and in shares.
    target_allocations = {stock: weights.get(stock, 0) * total_portfolio_value for stock in top_stocks if stock in valid_prices}
    target_shares = {stock: target_allocations[stock] / valid_prices[stock] for stock in target_allocations}

    # --- Sell first ---
    # Close positions that are no longer selected.
    for stock in list(current_allocation.keys()):
        if stock in top_stocks:
            continue
        price = _price_on_rebalance(stock)
        if price is None:
            # No usable price today: the position stays in the portfolio.
            continue
        shares_to_sell = current_allocation[stock]
        proceeds = shares_to_sell * price
        fee = proceeds * sell_fee_rate
        cash += proceeds - fee
        print(f"卖出 {stock} 的 {shares_to_sell:.2f} 股，获得现金 {proceeds - fee:.2f}，手续费 {fee:.2f}")
        trade_info['Sell'].append({'Stock': stock, 'Shares': shares_to_sell, 'Proceeds': proceeds - fee, 'Fee': fee})
        del current_allocation[stock]

    # Trim positions that exceed their target.
    for stock in top_stocks:
        if stock not in current_allocation or stock not in valid_prices:
            # Without a usable price the position must not be "sold" at price 0.
            continue
        target_shares_to_hold = target_shares.get(stock, 0)
        current_shares = current_allocation.get(stock, 0)
        price = valid_prices[stock]
        if current_shares > target_shares_to_hold:
            shares_to_sell = current_shares - target_shares_to_hold
            proceeds = shares_to_sell * price
            fee = proceeds * sell_fee_rate
            cash += proceeds - fee
            current_allocation[stock] = target_shares_to_hold
            print(f"卖出 {stock} 的 {shares_to_sell:.2f} 股，获得现金 {proceeds - fee:.2f}，手续费 {fee:.2f}")
            trade_info['Sell'].append({'Stock': stock, 'Shares': shares_to_sell, 'Proceeds': proceeds - fee, 'Fee': fee})

    # --- Then buy ---
    for stock in top_stocks:
        if stock not in valid_prices:
            continue
        target_shares_to_hold = target_shares.get(stock, 0)
        current_shares = current_allocation.get(stock, 0)
        price = valid_prices[stock]
        if current_shares >= target_shares_to_hold:
            continue

        shares_to_buy = target_shares_to_hold - current_shares
        cost = shares_to_buy * price
        fee = cost * buy_fee_rate
        total_cost = cost + fee
        if total_cost <= cash:
            cash -= total_cost
            current_allocation[stock] = target_shares_to_hold
            print(f"买入 {stock} 的 {shares_to_buy:.2f} 股，花费现金 {total_cost:.2f}，手续费 {fee:.2f}")
            trade_info['Buy'].append({'Stock': stock, 'Shares': shares_to_buy, 'Cost': total_cost, 'Fee': fee})
        elif cash > 0:
            # Not enough cash for the full order: spend what is left, fee included.
            shares_to_buy = cash / (price + price * buy_fee_rate)
            cost = shares_to_buy * price
            fee = cost * buy_fee_rate
            total_cost = cost + fee
            cash -= total_cost
            current_allocation[stock] = current_shares + shares_to_buy
            print(f"现金不足，调整 {stock} 的买入数量为 {shares_to_buy:.2f} 股，花费现金 {total_cost:.2f}，手续费 {fee:.2f}")
            trade_info['Buy'].append({'Stock': stock, 'Shares': shares_to_buy, 'Cost': total_cost, 'Fee': fee})
        # With no cash left nothing is bought; this avoids logging zero-size or
        # rounding-noise negative trades.

    # Store the trade record.
    trade_history.append(trade_info)

    return current_allocation, cash

# Main loop: one rebalance per week.

# Price columns must be strings to match the ticker strings; this does not
# depend on the week, so it is done once before the loop.
df_prices.columns = df_prices.columns.astype(str)

for week_start in weeks:
    print(f"\n处理周开始日期：{week_start.strftime('%Y-%m-%d')}")
    week_end = week_start + pd.Timedelta(days=6)  # last calendar day of the week

    # Trade on the week start, or on the next date that has prices.
    rebalance_date = week_start
    actual_rebalance_date = get_next_trading_day(rebalance_date, df_prices.index)
    if actual_rebalance_date is None:
        print(f"{rebalance_date.strftime('%Y-%m-%d')} 之后没有可用的交易日，跳过该周。")
        continue
    elif actual_rebalance_date != rebalance_date:
        print(f"原始 rebalance_date {rebalance_date.strftime('%Y-%m-%d')} 不存在，使用下一个交易日 {actual_rebalance_date.strftime('%Y-%m-%d')}。")
        rebalance_date = actual_rebalance_date

    # Weekly predictions are keyed by the week's start date, so the lookup must
    # use week_start. Using the shifted rebalance date would never match when
    # the week start is not a trading day, and the whole week would be skipped.
    # NOTE(review): weekly_predictions averages the predictions of every date
    # in the week, including dates after the rebalance day - confirm this is
    # not look-ahead.
    top_stocks, predictions_on_date = get_top_20_stocks_weekly(weekly_predictions, week_start)
    if not top_stocks:
        print(f"{rebalance_date.strftime('%Y-%m-%d')} 没有可用的预测结果，跳过该周。")
        continue

    # Tickers as strings, matching the price columns.
    top_stocks = [str(stock) for stock in top_stocks]

    # Prices on the first available day of the week for the selected stocks.
    try:
        week_prices = df_prices.loc[rebalance_date:week_end, top_stocks]
        earliest_prices = week_prices.iloc[0]
    except IndexError:
        print(f"{rebalance_date.strftime('%Y-%m-%d')} 没有足够的交易日，跳过该周。")
        continue
    except KeyError as e:
        print(f"{rebalance_date.strftime('%Y-%m-%d')} 发生 KeyError：{e}")
        continue

    earliest_prices = earliest_prices.dropna()
    valid_prices = earliest_prices.to_dict()

    # Drop stocks that have no price on the rebalance day.
    top_stocks = [stock for stock in top_stocks if stock in valid_prices]
    predictions_on_date = predictions_on_date[predictions_on_date['ticker'].isin(top_stocks)]

    if not top_stocks:
        print(f"{rebalance_date.strftime('%Y-%m-%d')} 无有效的股票可交易，跳过该周。")
        continue

    # Rebalance.
    current_allocation, cash = adjust_portfolio(top_stocks, valid_prices, current_allocation, cash, predictions_on_date, rebalance_date)

    # Daily portfolio value for the rest of the week: holdings at each day's
    # prices plus the cash left after rebalancing.
    held_stocks = list(current_allocation.keys())
    if held_stocks:
        try:
            week_prices = df_prices.loc[rebalance_date:week_end, held_stocks]
            holdings = pd.Series(current_allocation)
            daily_portfolio_value = (week_prices * holdings).sum(axis=1) + cash
            portfolio_value_per_day.loc[rebalance_date:week_end, 'Total'] = daily_portfolio_value
        except Exception as e:
            print(f"{rebalance_date.strftime('%Y-%m-%d')}: 计算每日市值时出错: {e}")

# Carry the last known portfolio value forward over dates the main loop did not
# value. The result is assigned back to the column: calling fillna(...,
# inplace=True) on a column selection is deprecated and has no effect on the
# frame when pandas Copy-on-Write is active, and the `method=` argument of
# fillna is deprecated in favour of ffill().
portfolio_value_per_day['Total'] = portfolio_value_per_day['Total'].ffill()

# Daily and cumulative returns of the portfolio.
daily_returns = portfolio_value_per_day['Total'].pct_change().dropna()
portfolio_value_per_day['Cumulative Returns'] = (1 + daily_returns).cumprod() - 1

# Performance statistics computed with empyrical.
annual_return = ep.annual_return(daily_returns)
sharpe_ratio = ep.sharpe_ratio(daily_returns)
max_drawdown = ep.max_drawdown(daily_returns)

print("\n=== 投资组合绩效指标 ===")
print(f"年化收益率: {annual_return:.2%}")
print(f"夏普比率: {sharpe_ratio:.2f}")
print(f"最大回撤: {max_drawdown:.2%}")

# Load the benchmark index from TWII.csv and order it chronologically by date.
index_prices = pd.read_csv('TWII.csv')
index_prices['Date'] = pd.to_datetime(index_prices['Date'])
index_prices = index_prices.set_index('Date').sort_index()

# Discard index rows dated before the first prediction.
index_prices = index_prices[index_prices.index >= start_date]

# 'Close' is converted to text, stripped of commas, then cast to float.
close_as_text = index_prices['Close'].astype(str)
index_prices['Close'] = close_as_text.str.replace(',', '').astype(float)

# Restrict the index and the portfolio to the dates they have in common.
common_dates = portfolio_value_per_day.index.intersection(index_prices.index)
portfolio_value_per_day = portfolio_value_per_day.loc[common_dates]
index_prices = index_prices.loc[common_dates]

# Daily and cumulative returns of the index.
index_prices['Index Returns'] = index_prices['Close'].pct_change()
index_growth = (1 + index_prices['Index Returns']).cumprod()
index_prices['Index Cumulative Returns'] = index_growth - 1

# Performance statistics of the index, computed on the non-missing daily returns.
index_daily_returns = index_prices['Index Returns'].dropna()
index_annual_return = ep.annual_return(index_daily_returns)
index_sharpe_ratio = ep.sharpe_ratio(index_daily_returns)
index_max_drawdown = ep.max_drawdown(index_daily_returns)

print("\n=== 加权指数绩效指标 ===")
print(f"年化收益率: {index_annual_return:.2%}")
print(f"夏普比率: {index_sharpe_ratio:.2f}")
print(f"最大回撤: {index_max_drawdown:.2%}")

# Plot the cumulative return of the portfolio against the index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(portfolio_value_per_day.index, portfolio_value_per_day['Cumulative Returns'], label='投资组合累计收益率', color='blue')
ax.plot(index_prices.index, index_prices['Index Cumulative Returns'], label='加权指数累计收益率', color='red')
ax.set_title('投资组合与加权指数累计收益率比较')
ax.set_xlabel('日期')
ax.set_ylabel('累计收益率')
ax.legend()
ax.grid(True)
plt.show()

# Write the per-rebalance trade log to a CSV file.
trade_history_df = pd.DataFrame(trade_history)
trade_history_df.to_csv('trade_history.csv', index=False)
print("交易历史已保存到 'trade_history.csv'")


GPU部分(不確定)

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
import lightgbm as lgb
import shap
import warnings
import matplotlib.pyplot as plt
from datetime import timedelta

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Configure matplotlib fonts so that Chinese text is displayed
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# ---------------------------
# 模型预测部分
# ---------------------------

class LightGBMModel:
    """Static helpers for SHAP feature importance and hyper-parameter search
    with LightGBM estimators configured with ``device='gpu'``.

    Every public method reports failures by printing the error and returning
    ``(None, None, None)`` instead of raising.
    """

    @staticmethod
    def _build_estimator(isClassifier, params=None):
        """Create an unfitted LightGBM classifier or regressor with ``device='gpu'``."""
        estimator_cls = lgb.LGBMClassifier if isClassifier else lgb.LGBMRegressor
        # 'device' is merged into the keyword dict so that a 'device' entry in
        # params overrides the default instead of raising a duplicate-keyword
        # TypeError.
        return estimator_cls(**{'device': 'gpu', **(params or {})})

    @staticmethod
    def _mean_abs_shap(shap_values):
        """Reduce SHAP values to one mean-absolute value per feature."""
        if isinstance(shap_values, list):
            # One (n_samples, n_features) array per class: average over samples
            # within each class, then over classes.
            per_class = [np.abs(np.asarray(values)).mean(axis=0) for values in shap_values]
            return np.mean(per_class, axis=0)

        magnitudes = np.abs(np.asarray(shap_values))
        if magnitudes.ndim == 3:
            # NOTE(review): assumes the layout (n_samples, n_features, n_outputs)
            # used by recent SHAP releases - confirm against the installed version.
            return magnitudes.mean(axis=(0, 2))
        return magnitudes.mean(axis=0)

    @staticmethod
    def _fit_search(search, train_X, train_Y):
        """Fit a scikit-learn search object and return (best model, best params, CV results frame)."""
        search.fit(train_X, train_Y)
        return search.best_estimator_, search.best_params_, pd.DataFrame(search.cv_results_)

    @staticmethod
    def shap_feature_importance(train_X, train_Y, params, isClassifier=True):
        """
        Compute feature importance from SHAP values.

        :param train_X: training features (its ``columns`` supply the feature names)
        :param train_Y: training labels
        :param params: model parameters
        :param isClassifier: whether to fit a classifier instead of a regressor
        :return: feature-importance DataFrame sorted by descending importance,
                 the raw SHAP values, and the fitted model;
                 ``(None, None, None)`` on any error
        """
        try:
            model = LightGBMModel._build_estimator(isClassifier, params)
            model.fit(train_X, train_Y)
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(train_X)

            # Classifiers can yield a list or a 3-D array of SHAP values; a plain
            # mean over axis 0 would then not produce one value per feature.
            mean_abs_shap = LightGBMModel._mean_abs_shap(shap_values)
            feature_importance_df = pd.DataFrame({'Feature': train_X.columns, 'Importance': mean_abs_shap})
            feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

            return feature_importance_df, shap_values, model
        except Exception as e:
            print('shap_feature_importance has error: ' + str(e))
            return None, None, None

    @staticmethod
    def grid_tune(train_X, train_Y, fold_time, param_grid, isClassifier=True):
        """
        Tune parameters with an exhaustive grid search.

        :param train_X: training features
        :param train_Y: training labels
        :param fold_time: number of time-series cross-validation splits
        :param param_grid: parameter grid
        :param isClassifier: whether to tune a classifier instead of a regressor
        :return: best model, best parameters and cross-validation results;
                 ``(None, None, None)`` on any error
        """
        try:
            model = LightGBMModel._build_estimator(isClassifier)
            scoring = 'accuracy' if isClassifier else 'neg_mean_absolute_error'

            # NOTE(review): n_jobs=-1 runs several fits in parallel, each with
            # device='gpu' - confirm the GPU copes with concurrent training.
            grid_search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                cv=TimeSeriesSplit(n_splits=fold_time),
                n_jobs=-1,
                scoring=scoring,
                verbose=1
            )
            return LightGBMModel._fit_search(grid_search, train_X, train_Y)
        except Exception as e:
            print('grid_tune has error: ' + str(e))
            return None, None, None

    @staticmethod
    def random_tune(train_X, train_Y, fold_time, param_distributions, isClassifier=True, n_iter=10):
        """
        Tune parameters with a randomized search.

        :param train_X: training features
        :param train_Y: training labels
        :param fold_time: number of time-series cross-validation splits
        :param param_distributions: parameter distributions
        :param isClassifier: whether to tune a classifier instead of a regressor
        :param n_iter: number of sampled parameter settings
        :return: best model, best parameters and cross-validation results;
                 ``(None, None, None)`` on any error
        """
        try:
            model = LightGBMModel._build_estimator(isClassifier)
            scoring = 'accuracy' if isClassifier else 'neg_mean_absolute_error'

            # NOTE(review): same parallel-fit / GPU consideration as in grid_tune.
            random_search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_distributions,
                cv=TimeSeriesSplit(n_splits=fold_time),
                n_jobs=-1,
                scoring=scoring,
                n_iter=n_iter,
                verbose=1,
                random_state=42
            )
            return LightGBMModel._fit_search(random_search, train_X, train_Y)
        except Exception as e:
            print('random_tune has error: ' + str(e))
            return None, None, None

# Load the merged dataset, parse 'Date' as datetime, and order the rows by
# ticker and then by date.
df = pd.read_csv('完整的合併資料.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['ticker', 'Date']).reset_index(drop=True)

# Target column (replace with the name of your own target variable); every
# column other than the date, the ticker and the target is used as a feature.
target = 'Adj_Close'
non_feature_columns = ('Date', 'ticker', target)
features = [column for column in df.columns if column not in non_feature_columns]

def _evaluate_fitted_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted model; returns (test_pred, test_mae, test_mape, train_mae, train_mape), MAPE in percent."""
    test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_mape = mean_absolute_percentage_error(y_test, test_pred) * 100
    # The training set is predicted once and reused for both training metrics.
    train_pred = model.predict(X_train)
    train_mae = mean_absolute_error(y_train, train_pred)
    train_mape = mean_absolute_percentage_error(y_train, train_pred) * 100
    return test_pred, test_mae, test_mape, train_mae, train_mape


def _score_tuned_model(ticker, label, tuned_model, baseline, X_train, y_train, X_test, y_test):
    """Evaluate a tuned model, or reuse the baseline results when tuning produced no model.

    Returns (model, test_pred, test_mae, test_mape, train_mae, train_mape);
    everything except the model is None when prediction fails.
    """
    if tuned_model is None:
        print(f"Ticker {ticker} - {label} 未找到最佳模型，使用原始模型的预测结果。")
        return baseline

    try:
        test_pred, test_mae, test_mape, train_mae, train_mape = _evaluate_fitted_model(
            tuned_model, X_train, y_train, X_test, y_test)
        print(f"Ticker {ticker} - {label} 训练集 MAE: {train_mae}, MAPE: {train_mape}%")
        print(f"Ticker {ticker} - {label} 测试集 MAE: {test_mae}, MAPE: {test_mape}%")
        return tuned_model, test_pred, test_mae, test_mape, train_mae, train_mape
    except Exception as e:
        print(f"Ticker {ticker} - {label} 预测失败: {str(e)}")
        return tuned_model, None, None, None, None, None


def train_and_predict_single_ticker(group_df, shap_threshold=0.85, use_feature_selection=True):
    """
    Train, tune and evaluate LightGBM regressors for one ticker.

    The rows are split chronologically (first 80% train, last 20% test). A
    baseline model, a grid-search model and a random-search model are fitted on
    the module-level ``features`` (optionally reduced by cumulative SHAP
    importance) to predict the module-level ``target``. The test-set
    predictions of the model with the lowest test MAE are returned.

    :param group_df: rows of a single ticker with 'Date', 'ticker', the target
                     and the feature columns
    :param shap_threshold: cumulative share of SHAP importance up to which
                           features are kept when feature selection is enabled
    :param use_feature_selection: whether to reduce the features via SHAP
    :return: dict with metrics, models, SHAP data and a 'predictions' DataFrame;
             ``None`` when there are fewer than 20 rows or the baseline model
             cannot be trained
    """
    # Chronological order is required for the time-based split below.
    group_df = group_df.sort_values('Date').reset_index(drop=True)
    # An empty group has no ticker to report; it is skipped like any short group.
    ticker = group_df['ticker'].iloc[0] if len(group_df) else None

    # Too few rows to train on.
    if len(group_df) < 20:
        print(f"Ticker {ticker} 数据量不足，跳过")
        return None

    # First 80% of the rows for training, last 20% for testing.
    split_index = int(len(group_df) * 0.8)
    train = group_df.iloc[:split_index]
    test = group_df.iloc[split_index:]

    if len(test) == 0:
        print(f"Ticker {ticker} 测试集为空，跳过")
        return None

    X_train = train[features]
    y_train = train[target]
    X_test = test[features]
    y_test = test[target]

    # Parameters of the baseline ("original") model.
    params = {
        'boosting_type': 'dart',
        'n_estimators': 100,
        'learning_rate': 0.05,
        'n_jobs': -1,
        'random_state': 7,
        'verbose': 0,
        'min_data_in_leaf': 5,
    }

    # Step 1: SHAP values are computed whether or not features are selected.
    try:
        feature_importance_df, shap_values, initial_model = LightGBMModel.shap_feature_importance(
            X_train, y_train, params=params, isClassifier=False)
        if feature_importance_df is not None:
            print(f"Ticker {ticker} - SHAP 计算成功")
        else:
            print(f"Ticker {ticker} - SHAP 计算失败")
    except Exception as e:
        print(f"Ticker {ticker} - SHAP 计算异常: {str(e)}")
        feature_importance_df, shap_values, initial_model = None, None, None

    if use_feature_selection and feature_importance_df is not None:
        try:
            total_importance = feature_importance_df['Importance'].sum()
            if not total_importance > 0:
                # Shares would be NaN/inf; handled below by using all features.
                raise ValueError("SHAP 重要性总和为 0 或无效")
            # Cumulative share of importance, most important feature first.
            feature_importance_df['Cumulative'] = feature_importance_df['Importance'].cumsum() / total_importance
            within_threshold = feature_importance_df['Cumulative'] <= shap_threshold
            important_features = feature_importance_df.loc[within_threshold, 'Feature'].tolist()
            if not important_features:
                # The top feature alone exceeds the threshold. Keep it rather
                # than training on zero columns, which would drop the ticker.
                important_features = [feature_importance_df['Feature'].iloc[0]]
            print(f"Ticker {ticker} - 选择的特征数量：{len(important_features)}")
            print(f"Ticker {ticker} - 选择的特征：{important_features}")
            selected_features = important_features
            # Keep only the SHAP columns of the selected features.
            selected_feature_indices = [X_train.columns.get_loc(f) for f in selected_features]
            shap_values_selected = shap_values[:, selected_feature_indices]
        except Exception as e:
            print(f"Ticker {ticker} - 特征筛选失败: {str(e)}")
            selected_features = features.copy()
            shap_values_selected = shap_values
    else:
        # No feature selection: use every feature.
        selected_features = features.copy()
        shap_values_selected = shap_values
        if not use_feature_selection:
            print(f"Ticker {ticker} - 未进行特征选择，使用所有特征")

    X_train_important = X_train[selected_features]
    X_test_important = X_test[selected_features]

    # Step 2: baseline model.
    try:
        original_model = lgb.LGBMRegressor(**params, device='gpu')
        original_model.fit(X_train_important, y_train)
        (y_pred_original, mae_original, mape_original,
         mae_original_train, mape_original_train) = _evaluate_fitted_model(
            original_model, X_train_important, y_train, X_test_important, y_test)
        print(f"Ticker {ticker} - 原始模型训练集 MAE: {mae_original_train}, MAPE: {mape_original_train}%")
        print(f"Ticker {ticker} - 原始模型测试集 MAE: {mae_original}, MAPE: {mape_original}%")
    except Exception as e:
        print(f"Ticker {ticker} - 原始模型训练失败: {str(e)}")
        return None

    # Results reused whenever a search does not return a model.
    baseline = (original_model, y_pred_original, mae_original, mape_original,
                mae_original_train, mape_original_train)

    # Step 3: grid search.
    param_grid = {
        'learning_rate': [0.005, 0.01, 0.03, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9, -1],
        'num_leaves': [31, 63],
        'n_estimators': [100, 200],
        'boosting_type': ['gbdt', 'dart'],
        'random_state': [7],
    }

    best_model_grid, best_params_grid, cv_results_grid = LightGBMModel.grid_tune(
        X_train_important, y_train, fold_time=5, param_grid=param_grid, isClassifier=False)

    (best_model_grid, y_pred_grid, mae_grid, mape_grid,
     mae_grid_train, mape_grid_train) = _score_tuned_model(
        ticker, 'Grid Search', best_model_grid, baseline,
        X_train_important, y_train, X_test_important, y_test)

    # Step 4: random search.
    param_dist = {
        'learning_rate': [0.005, 0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9, -1],
        'num_leaves': [15, 31, 63, 127],
        'n_estimators': [50, 100, 200, 500],
        'min_child_samples': [5, 10, 20, 50],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'lambda_l1': [0, 0.01, 0.1, 1],
        'lambda_l2': [0, 0.01, 0.1, 1],
        'boosting_type': ['gbdt', 'dart'],
        'random_state': [7],
    }

    best_model_random, best_params_random, cv_results_random = LightGBMModel.random_tune(
        X_train_important, y_train, fold_time=5, param_distributions=param_dist, isClassifier=False, n_iter=100)

    (best_model_random, y_pred_random, mae_random, mape_random,
     mae_random_train, mape_random_train) = _score_tuned_model(
        ticker, 'Random Search', best_model_random, baseline,
        X_train_important, y_train, X_test_important, y_test)

    # Step 5: keep the model with the lowest test MAE (ties go to the earlier
    # candidate in the order original, grid search, random search).
    # NOTE(review): the model is chosen on the same test set whose predictions
    # are returned - confirm this selection bias is acceptable downstream.
    candidates = {
        'original': (mae_original, y_pred_original),
        'grid_search': (mae_grid, y_pred_grid),
        'random_search': (mae_random, y_pred_random),
    }
    mae_values = {name: mae for name, (mae, _) in candidates.items() if mae is not None}

    if not mae_values:
        print(f"Ticker {ticker} - 没有有效的 MAE 值，跳过模型选择")
        return None

    best_model_type = min(mae_values, key=mae_values.get)
    best_mae = mae_values[best_model_type]
    print(f"Ticker {ticker} - 选择的最佳模型: {best_model_type}，测试集 MAE: {best_mae}")
    final_predicted = candidates[best_model_type][1]

    # Result dictionary; metrics of a failed evaluation are stored as NaN.
    result_dict = {
        'ticker': ticker,
        'selected_features': selected_features,
        'original_train_mae': mae_original_train,
        'original_train_mape': mape_original_train,
        'original_test_mae': mae_original,
        'original_test_mape': mape_original,
        'grid_train_mae': mae_grid_train if mae_grid_train is not None else np.nan,
        'grid_train_mape': mape_grid_train if mape_grid_train is not None else np.nan,
        'grid_test_mae': mae_grid if mae_grid is not None else np.nan,
        'grid_test_mape': mape_grid if mape_grid is not None else np.nan,
        'random_train_mae': mae_random_train if mae_random_train is not None else np.nan,
        'random_train_mape': mape_random_train if mape_random_train is not None else np.nan,
        'random_test_mae': mae_random if mae_random is not None else np.nan,
        'random_test_mape': mape_random if mape_random is not None else np.nan,
        'best_model_type': best_model_type,
        'best_mae': best_mae,
        'original_model': original_model,
        'best_model_grid': best_model_grid,
        'best_model_random': best_model_random,
        'y_test': y_test,
        'y_pred_original': y_pred_original,
        'y_pred_grid': y_pred_grid,      # None when grid-search prediction failed
        'y_pred_random': y_pred_random,  # None when random-search prediction failed
        'test_dates': test['Date'].values,
        'shap_values': shap_values,
        'shap_feature_names': X_train_important.columns.tolist(),
        'X_train_important': X_train_important,
        'best_params_grid': best_params_grid,
        'best_params_random': best_params_random,
        'cv_results_grid': cv_results_grid,
        'cv_results_random': cv_results_random,
    }

    # Store the SHAP values of the selected features only when their row count
    # matches the training matrix.
    if shap_values_selected is not None:
        try:
            print(f"Ticker {ticker} - shap_values_selected.shape: {shap_values_selected.shape}")
            print(f"Ticker {ticker} - X_train_important.shape: {X_train_important.shape}")

            if shap_values_selected.shape[0] != X_train_important.shape[0]:
                print(f"Ticker {ticker} - shap_values_selected 样本数与 X_train_important 不匹配")
                result_dict['shap_values_selected'] = None
            else:
                result_dict['shap_values_selected'] = shap_values_selected
        except Exception as e:
            print(f"Ticker {ticker} - 筛选 shap_values 失败: {str(e)}")
            result_dict['shap_values_selected'] = None
    else:
        result_dict['shap_values_selected'] = None

    # Actual and predicted target values of the test period, from the best model.
    predictions = pd.DataFrame({
        'Date': test['Date'].values,
        'ticker': ticker,
        'Actual_Return': y_test.values,
        'Predicted_Return': final_predicted,
        'Best_Model': best_model_type
    })
    result_dict['predictions'] = predictions

    return result_dict

# Train and evaluate one ticker at a time; tickers whose run returns None are
# left out of the results.
grouped = df.groupby('ticker')
results = []

for name, group in grouped:
    print(f"\n正在处理 Ticker: {name}")
    result = train_and_predict_single_ticker(
        group,
        shap_threshold=0.85,          # cumulative SHAP share used when selecting features
        use_feature_selection=False    # True: select features via SHAP; False: use all features
    )
    if result is None:
        continue
    results.append(result)

# Combine the per-ticker predictions into one frame and save it.
if not results:
    print("没有任何预测结果被生成。")
else:
    prediction_frames = [res['predictions'] for res in results]
    all_predictions = pd.concat(prediction_frames, ignore_index=True)

    # Make sure the dates are datetime values.
    all_predictions['Date'] = pd.to_datetime(all_predictions['Date'])

    # Save the predictions as a CSV file.
    all_predictions.to_csv('predictions.csv', index=False)
    print("\n所有预测结果已保存为 'predictions.csv'")

# Draw the SHAP plots and prediction diagnostics for every ticker.
shap.initjs()

for res in results:
    ticker = res['ticker']
    shap_values_selected = res['shap_values_selected']  # SHAP values of the selected features
    feature_names = res['shap_feature_names']
    X_train_important = res['X_train_important']

    print(f"\n绘制 Ticker {ticker} 的 SHAP 图")

    # Show the shapes of the SHAP values and of the data matrix.
    if shap_values_selected is not None:
        print(f"Ticker {ticker} - shap_values_selected.shape: {shap_values_selected.shape}")
        print(f"Ticker {ticker} - X_train_important.shape: {X_train_important.shape}")

    if shap_values_selected is not None and X_train_important.shape[0] > 0:
        try:
            # SHAP summary plot.
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values_selected, X_train_important, feature_names=feature_names, show=False)
            plt.title(f"Ticker {ticker} - SHAP Summary Plot")
            plt.savefig(f'shap_summary_{ticker}.png', dpi=300)
            plt.close()
            print(f"保存 Ticker {ticker} 的 SHAP Summary Plot 为 'shap_summary_{ticker}.png'")

            # SHAP bar plot.
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values_selected, X_train_important, feature_names=feature_names, plot_type="bar", show=False)
            plt.title(f"Ticker {ticker} - SHAP Feature Importance")
            plt.savefig(f'shap_bar_{ticker}.png', dpi=300)
            plt.close()
            print(f"保存 Ticker {ticker} 的 SHAP Feature Importance 图为 'shap_bar_{ticker}.png'")
        except Exception as e:
            print(f"Ticker {ticker} 的 SHAP 绘图失败: {str(e)}")
            # Close the figure that was open when plotting failed; otherwise
            # open figures accumulate across tickers.
            plt.close()
    else:
        print(f"Ticker {ticker} 的 SHAP 值计算失败，无法绘制 SHAP 图")

    predictions = res['predictions']
    if predictions is not None and not predictions.empty:
        # Actual versus predicted values over the test period.
        plt.figure(figsize=(12, 6))
        plt.plot(predictions['Date'], predictions['Actual_Return'], label='Actual Return', marker='o')
        plt.plot(predictions['Date'], predictions['Predicted_Return'], label='Predicted Return', marker='x')
        plt.xlabel('Date')
        plt.ylabel('Return')
        plt.title(f'Ticker {ticker} - Actual vs Predicted Returns')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f'actual_vs_predicted_{ticker}.png', dpi=300)
        plt.close()
        print(f"保存 Ticker {ticker} 的 Actual vs Predicted Returns 图为 'actual_vs_predicted_{ticker}.png'")

        # Histogram of the prediction error (actual minus predicted).
        plt.figure(figsize=(10, 6))
        error = predictions['Actual_Return'] - predictions['Predicted_Return']
        plt.hist(error, bins=30, edgecolor='k', alpha=0.7)
        plt.xlabel('Prediction Error (Actual - Predicted)')
        plt.ylabel('Frequency')
        plt.title(f'Ticker {ticker} - Prediction Error Distribution')
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f'prediction_error_distribution_{ticker}.png', dpi=300)
        plt.close()
        print(f"保存 Ticker {ticker} 的 Prediction Error Distribution 图为 'prediction_error_distribution_{ticker}.png'")
    else:
        print(f"Ticker {ticker} 没有足够的数据绘制 Actual vs Predicted Returns 图")

# Print MAE, MAPE and the chosen model type for every ticker.
for res in results:
    ticker = res['ticker']
    report_lines = [
        f"\n=== {ticker} 的模型性能 ===",
        f"原始模型训练集 MAE: {res['original_train_mae']}, MAPE: {res['original_train_mape']}%",
        f"原始模型测试集 MAE: {res['original_test_mae']}, MAPE: {res['original_test_mape']}%",
        f"Grid Search 调优模型训练集 MAE: {res['grid_train_mae']}, MAPE: {res['grid_train_mape']}%",
        f"Grid Search 调优模型测试集 MAE: {res['grid_test_mae']}, MAPE: {res['grid_test_mape']}%",
        f"Random Search 调优模型训练集 MAE: {res['random_train_mae']}, MAPE: {res['random_train_mape']}%",
        f"Random Search 调优模型测试集 MAE: {res['random_test_mae']}, MAPE: {res['random_test_mape']}%",
        f"最佳模型类型: {res['best_model_type']}",
        f"最佳模型测试集 MAE: {res['best_mae']}",
    ]
    # One print call per line, in the order listed above.
    for report_line in report_lines:
        print(report_line)

