# 股票预测模型工作流

---
### 工作流说明
1.  **阶段零 (Setup)**: 导入库、加载配置。
2.  **阶段一 (Data Pipeline)**: 独立运行。负责处理并保存数据，生成 L2 特征数据缓存。
3.  **阶段二 (Model Pipeline)**: 独立运行。包含四个子步骤：
    - **2.1 (预处理)**: 智能地加载或生成 L3 预处理数据缓存。
    - **2.2 (HPO)**: 基于 L3 缓存自动调参。
    - **2.3 (模型训练)**: 使用 L3 缓存进行高效的模型训练。
    - **2.4 (评估)**: 对训练结果进行聚合与可视化。

## 0. 通用设置与导入

In [1]:
import os, sys, yaml, torch, joblib, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from pathlib import Path
from tqdm.autonotebook import tqdm
from sklearn.preprocessing import StandardScaler

# Pin the OpenCL context selection to entry '0' through the environment.
# NOTE(review): presumably consumed by an OpenCL-backed dependency — confirm which one.
os.environ.update({'PYOPENCL_CTX': '0'})

# Apply the base style first; the rcParams overrides below are applied after it
# so that the style sheet cannot overwrite them.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # font able to render the Chinese plot labels
    'axes.unicode_minus': False,    # draw minus signs with the ASCII hyphen
})

# Import the project modules. If they are not importable from the current
# sys.path, append the current working directory (resolved to an absolute path)
# and retry once; a second failure propagates.
try:
    from data_process.get_data import initialize_apis, shutdown_apis
    from data_process.save_data import run_data_pipeline, get_processed_data_path
    from model_builders.build_models import run_training_for_ticker, _walk_forward_split
    from model_builders.hpo_utils import run_hpo_for_ticker
except ImportError as import_error:
    print(f"WARNNING: 导入失败: {import_error}. 添加项目根目录...")
    project_root = str(Path().resolve())
    if project_root not in sys.path:
        sys.path.append(project_root)
    from data_process.get_data import initialize_apis, shutdown_apis
    from data_process.save_data import run_data_pipeline, get_processed_data_path
    from model_builders.build_models import run_training_for_ticker, _walk_forward_split
    from model_builders.hpo_utils import run_hpo_for_ticker
    print("INFO: 导入成功.")
else:
    print("INFO: 项目模型导入成功.")

CONFIG_PATH = 'configs/config.yaml'

# Load the YAML configuration. A missing file is reported and treated as an
# empty configuration so the remaining cells can detect it via `if config`.
try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        # safe_load() yields None for an empty document; normalise that to {}
        # so `config` is always a mapping.
        config = yaml.safe_load(f) or {}
    print(f"SUCCESS: 从 '{CONFIG_PATH}' 加载 Config.")
except FileNotFoundError:
    print("ERROR: 未找到 Config.")
    config = {}

# Bind the per-section shortcuts unconditionally. Later cells read these names
# without checking `config` first, so leaving them undefined for an empty
# configuration would surface as a NameError far from the real cause.
# `or` also covers sections that are present but null in the YAML file.
global_settings = config.get('global_settings') or {}
strategy_config = config.get('strategy_config') or {}
hpo_config = config.get('hpo_config') or {}
default_model_params = config.get('default_model_params') or {}
stocks_to_process = config.get('stocks_to_process') or []

  from tqdm.autonotebook import tqdm


INFO: 项目模型导入成功.
SUCCESS: 从 'configs/config.yaml' 加载 Config.


# **阶段一：数据准备与特征工程**

In [2]:
# Stage 1: initialise the data APIs and run the data pipeline.
# shutdown_apis() runs in every case, including failures and an empty config.
print("--- 开始步骤 1: 数据准备 ---\n")
try:
    if not config:
        print("ERROR: Config 为空.")
    else:
        initialize_apis(config)
        run_data_pipeline(config_path=CONFIG_PATH)
finally:
    shutdown_apis()

--- 开始步骤 1: 数据准备 ---

INFO: 尝试登陆 Baostock...
login success!
INFO: Baostock API 登录成功。SDK版本: 00.8.90
INFO: 未在配置中提供有效的 Tushare Token。将跳过宏观数据获取。
开始执行数据管道协调任务...
将使用配置文件: configs/config.yaml

需要为以下 7 只股票生成新数据: ['600519.SH', '000001.SZ', '688256.SH', '601606.SH', '000681.SZ', '603099.SH', '000100.SZ']
--- 开始批量特征生成 ---
针对特定股票: 7 生成特征.

--- Generating features for 贵州茅台 (600519.SH) ---
  - Running in Training Mode: Fetching historical data based on config.
  - Data window: Requesting data from 2015-08-31 to 2025-08-31.
  - [1/7] 正在从 Baostock 下载 sh.600519 的日线行情...
  - INFO: 已将 sh.600519 的原始数据缓存至 data_cache\raw_ohlcv\raw_sh.600519_2015-08-31_2025-08-31.pkl
  - INFO: Received data for 贵州茅台 from 2015-08-31 to 2025-08-29.
INFO: Starting feature calculation pipeline...
  - [Calculating Features] Running: Technical Indicators...
    - Calculated: ema with params {'length': 10}
    - Calculated: ema with params {'length': 30}
    - Calculated: rsi with params {'length': 14}
    - Calculated: macd with 

# **阶段二：模型训练与评估**

### 2.1 数据预加载与全局预处理 (L3 缓存)

In [3]:
print("--- 开始步骤 2.1: 数据预加载与全局预处理 ---\n")

# Location of the L3 (pre-processed folds) cache; the directory is created on demand.
L3_CACHE_DIR = Path(global_settings.get('output_dir', 'data/processed'))
L3_CACHE_DIR.mkdir(parents=True, exist_ok=True)
# The file name could embed a hash of the config so that the cache invalidates
# itself when the config changes. For simplicity a fixed name is used; delete
# the file (or set FORCE_REPROCESS) to refresh it.
L3_CACHE_PATH = L3_CACHE_DIR / "_preprocessed_cache.joblib"

global_data_cache = {}
FORCE_REPROCESS = False  # Set to True to force the L3 cache to be regenerated.

# Reuse an existing L3 cache unless a rebuild was explicitly requested.
if L3_CACHE_PATH.exists() and not FORCE_REPROCESS:
    print(f"INFO: 已找到 L3 预处理数据缓存. 将从 {L3_CACHE_PATH} 加载...")
    try:
        # NOTE: joblib.load unpickles arbitrary objects; only load cache files
        # produced by this notebook.
        global_data_cache = joblib.load(L3_CACHE_PATH)
        print("SUCCESS: L3 缓存已成功加载至内存.")
    except Exception as e:
        # Best effort: an unreadable cache is discarded and rebuilt below.
        print(f"WARNNING: 加载 L3 缓存失败: {e}. 将开始预处理数据.")
        global_data_cache = {}

# An empty cache (missing file, failed load, or forced rebuild) triggers preprocessing.
if not global_data_cache:
    print("INFO: L3 缓存未找到或为空. 开始预处理...\n")
    if config and stocks_to_process:
        label_col = global_settings.get('label_column', 'label_return')

        # Same default as the training cell, so folds exist for every model
        # that cell will try to train when 'models_to_train' is not configured.
        lstm_requested = 'lstm' in global_settings.get('models_to_train', ['lgbm', 'lstm'])

        # Only import and construct the LSTM builder when LSTM folds are needed.
        lstm_builder_for_preprocessing = None
        if lstm_requested:
            from model_builders.lstm_builder import LSTMBuilder
            lstm_builder_for_preprocessing = LSTMBuilder(config)

        for stock_info in tqdm(stocks_to_process, desc="Pre-processing Stocks"):
            ticker = stock_info.get('ticker')
            keyword = stock_info.get('keyword', ticker)
            if not ticker:
                continue
            data_path = get_processed_data_path(stock_info, config)
            if not data_path.exists():
                print(f"\nERROR: 未找到 {keyword} 的数据.")
                continue
            df = pd.read_pickle(data_path)
            df.index.name = 'date'
            folds = _walk_forward_split(df, strategy_config)
            if not folds:
                print(f"\nWARNNING: 未找到 {keyword} 的 folds.")
                continue

            preprocessed_folds_lgbm = []
            preprocessed_folds_lstm = []
            # Every column except the label is treated as a model feature.
            features_for_model = [c for c in df.columns if c != label_col]

            for train_df, val_df in folds:
                X_train_model, y_train = train_df[features_for_model], train_df[label_col]
                X_val_model, y_val = val_df[features_for_model], val_df[label_col]

                # The scaler is fitted on the training split only and then
                # applied to the validation split.
                scaler_lgbm = StandardScaler()
                X_train_scaled = pd.DataFrame(
                    scaler_lgbm.fit_transform(X_train_model),
                    index=X_train_model.index, columns=features_for_model,
                )
                X_val_scaled = pd.DataFrame(
                    scaler_lgbm.transform(X_val_model),
                    index=X_val_model.index, columns=features_for_model,
                )
                preprocessed_folds_lgbm.append({
                    'X_train_scaled': X_train_scaled, 'y_train': y_train,
                    'X_val_scaled': X_val_scaled, 'y_val': y_val,
                    'scaler': scaler_lgbm,
                })

                if lstm_requested:
                    # The LSTM path scales copies of the full frames (label
                    # column left untouched) with its own scaler instance.
                    scaler_lstm = StandardScaler()
                    train_df_scaled, val_df_scaled = train_df.copy(), val_df.copy()
                    train_df_scaled[features_for_model] = scaler_lstm.fit_transform(train_df[features_for_model])
                    val_df_scaled[features_for_model] = scaler_lstm.transform(val_df[features_for_model])
                    # _create_sequences returns three values; the third is kept
                    # only for the validation split (stored as 'dates_val_seq').
                    X_train_seq, y_train_seq, _ = lstm_builder_for_preprocessing._create_sequences(train_df_scaled, features_for_model)
                    X_val_seq, y_val_seq, dates_val_seq = lstm_builder_for_preprocessing._create_sequences(val_df_scaled, features_for_model)
                    preprocessed_folds_lstm.append({
                        'X_train_tensor': torch.from_numpy(X_train_seq),
                        'y_train_tensor': torch.from_numpy(y_train_seq).unsqueeze(1),
                        'X_val_tensor': torch.from_numpy(X_val_seq),
                        'y_val_tensor': torch.from_numpy(y_val_seq).unsqueeze(1),
                        'y_val_seq': y_val_seq, 'dates_val_seq': dates_val_seq,
                        'scaler': scaler_lstm,
                    })

            global_data_cache[ticker] = {
                'full_df': df,
                'lgbm_folds': preprocessed_folds_lgbm,
                'lstm_folds': preprocessed_folds_lstm,
            }

        if global_data_cache:
            print(f"\nINFO: 预处理完成. L3 缓存将保存至 {L3_CACHE_PATH}...")
            try:
                joblib.dump(global_data_cache, L3_CACHE_PATH)
                print("SUCCESS: L3 缓存 保存成功.")
            except Exception as e:
                # Saving is best effort; the in-memory cache is still usable.
                print(f"ERROR: 保存 L3 缓存失败: {e}")
        else:
            # Nothing was produced, so do not persist an empty cache file and
            # report it as a success.
            print("\nWARNNING: 未生成任何预处理数据. 跳过保存 L3 缓存.")

print("\n--- 步骤 2.1 完成: 所有数据已加载至内存. ---")

--- 开始步骤 2.1: 数据预加载与全局预处理 ---

INFO: L3 缓存未找到或为空. 开始预处理...

INFO: PyTorch LSTMBuilder will use device: CUDA


Pre-processing Stocks:   0%|          | 0/7 [00:00<?, ?it/s]


ERROR: 未找到 寒武纪-U 的数据.

INFO: 预处理完成. L3 缓存将保存至 data\processed\_preprocessed_cache.joblib...
SUCCESS: L3 缓存 保存成功.

--- 步骤 2.1 完成: 所有数据已加载至内存. ---


### 2.2 超参数优化

In [None]:
RUN_HPO = True  # Set to True to run the optimisation, False to skip it.
HPO_TRIALS = hpo_config.get('n_trials', 50)

# Hyper-parameter optimisation: run HPO per configured ticker on the cached
# LightGBM folds, report and save the per-ticker results, then average the best
# parameters across tickers and write them into the default LightGBM parameters.
if RUN_HPO and config:
    hpo_tickers = hpo_config.get('tickers_for_hpo', [])

    if not hpo_tickers:
        print("INFO: 在配置文件中未指定用于 HPO 的股票，跳过此步骤。")
    elif not globals().get('global_data_cache'):
        # Covers both "preprocessing cell never ran" and "cache is empty".
        print("ERROR: 全局数据缓存 (global_data_cache) 为空。请先成功运行 2.1 预处理单元格。")
    else:
        print(f"--- 开始为以下股票进行超参数优化: {hpo_tickers} ---\n")
        model_type_for_hpo = 'lgbm'
        hpo_results_list = []

        for ticker in hpo_tickers:
            # .get() so that an entry without a 'ticker' key is skipped instead
            # of raising KeyError.
            stock_info = next((s for s in stocks_to_process if s.get('ticker') == ticker), None)
            if not stock_info:
                print(f"WARNNING: 未在 'stocks_to_process' 中找到 HPO 股票 {ticker} 的配置。跳过。")
                continue

            keyword = stock_info.get('keyword', ticker)

            # HPO data comes from the L3 cache (global_data_cache).
            if ticker not in global_data_cache:
                print(f"ERROR: 预处理数据缓存中未找到 {keyword} 的数据。请先成功运行 2.1 单元格。跳过。")
                continue

            all_preprocessed_folds = global_data_cache[ticker].get(f'{model_type_for_hpo}_folds', [])
            if not all_preprocessed_folds:
                print(f"WARNNING: 缓存中未找到 {keyword} 的 '{model_type_for_hpo}' 预处理数据。跳过 HPO。")
                continue

            # Only the most recent folds are used for HPO.
            # NOTE(review): a configured value of 0 selects every fold, because
            # a [-0:] slice returns the whole list — confirm that is intended.
            num_eval_folds = hpo_config.get('hpo_num_eval_folds', 2)
            hpo_folds_data = all_preprocessed_folds[-num_eval_folds:]
            print(f"INFO: 已从 L3 缓存中为 {keyword} 加载最后 {len(hpo_folds_data)} 个预处理 fold 用于 HPO。")

            hpo_run_config = {
                'global_settings': global_settings, 'strategy_config': strategy_config,
                'default_model_params': default_model_params, 'stocks_to_process': [stock_info],
                'hpo_config': hpo_config
            }

            best_params, best_value = run_hpo_for_ticker(
                preprocessed_folds=hpo_folds_data,
                ticker=ticker,
                config=hpo_run_config,
                model_type=model_type_for_hpo,
                n_trials=HPO_TRIALS
            )

            if best_params:
                hpo_results_list.append({'ticker': ticker, 'keyword': keyword, 'best_score': best_value, **best_params})

        if hpo_results_list:
            # Display names for the result table columns.
            PARAM_MAP_CN = {
                'best_score': '最佳分数 (ICIR)', 'num_leaves': '叶子节点数',
                'learning_rate': '学习率', 'min_child_samples': '叶节点最小样本数',
                'feature_fraction': '特征采样比例', 'bagging_fraction': '数据采样比例',
                'reg_alpha': 'L1正则化', 'reg_lambda': 'L2正则化'
            }

            hpo_results_df = pd.DataFrame(hpo_results_list).set_index(['ticker', 'keyword'])
            hpo_results_df.rename(columns={k: v for k, v in PARAM_MAP_CN.items() if k in hpo_results_df.columns}, inplace=True)

            print("\n" + "="*80)
            print("--- HPO 详细结果汇总 ---")
            display(hpo_results_df.style.format({'最佳分数 (ICIR)': '{:.4f}'}).background_gradient(cmap='viridis', subset=['最佳分数 (ICIR)']))

            # Persist the per-ticker results under a timestamped file name.
            hpo_log_dir = Path("hpo_logs")
            hpo_log_dir.mkdir(exist_ok=True)
            timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
            hpo_results_path = hpo_log_dir / f"hpo_results_{timestamp}.csv"
            hpo_results_df.to_csv(hpo_results_path, encoding='utf-8-sig')
            print(f"成功: HPO 详细结果已保存至 {hpo_results_path}")
            print("="*80)

            # Average the best parameters over all tickers. The parameter
            # columns are taken from the first result entry.
            param_cols_original = [c for c in hpo_results_list[0].keys() if c not in ['ticker', 'keyword', 'best_score']]
            final_hpo_params_df = pd.DataFrame(hpo_results_list)

            # Cast to plain Python floats so the values stored in the config
            # are ordinary scalars.
            final_hpo_params = {
                name: float(value)
                for name, value in final_hpo_params_df[param_cols_original].mean().items()
            }
            average_best_score = final_hpo_params_df['best_score'].mean()

            # These parameters are rounded back to integers after averaging.
            for p in ['num_leaves', 'min_child_samples']:
                if p in final_hpo_params:
                    final_hpo_params[p] = int(round(final_hpo_params[p]))

            # Write the averaged parameters into the defaults. setdefault keeps
            # this working when the config has no 'lgbm_params' (or no
            # 'default_model_params') section, where direct indexing would
            # raise KeyError. `config` and `default_model_params` are left
            # pointing at the same mapping.
            lgbm_params = default_model_params.setdefault('lgbm_params', {})
            lgbm_params.update(final_hpo_params)
            config['default_model_params'] = default_model_params

            print("\n" + "="*80)
            print("--- HPO 综合结果 ---")
            print(f"本轮 HPO 平均最高分 (ICIR): {average_best_score:.4f}")
            print("将用于后续训练的【平均参数】如下:")
            print(yaml.dump(default_model_params['lgbm_params'], allow_unicode=True))
            print("="*80)
else:
    print("INFO: 跳过 HPO 步骤。模型将使用配置文件中的默认参数进行训练。")

[I 2025-10-13 08:16:06,068] A new study created in memory with name: no-name-9a2bf01f-72c0-4969-8962-075aee1072e8


--- 开始为以下股票进行超参数优化: ['603099.SH', '000100.SZ', '600519.SH'] ---

INFO: 已从 L3 缓存中为 长白山 加载最后 2 个预处理 fold 用于 HPO。

--- 开始为 长白山 (603099.SH) 进行 HPO (共 50 轮) ---


  0%|          | 0/50 [00:00<?, ?it/s]

    - Quantile 0.05: Finished. Best iter: [65]
    - Quantile 0.5: Finished. Best iter: [65]
    - Quantile 0.95: Finished. Best iter: [1]
    - Quantile 0.05: Finished. Best iter: [21]
    - Quantile 0.5: Finished. Best iter: [6]
    - Quantile 0.95: Finished. Best iter: [6]
[I 2025-10-13 08:16:10,094] Trial 0 finished with value: 25.65356749955836 and parameters: {'num_leaves': 25, 'learning_rate': 0.07969454818643935, 'min_child_samples': 47, 'feature_fraction': 0.8394633936788146, 'bagging_fraction': 0.6624074561769746, 'reg_alpha': 0.029375384576328288, 'reg_lambda': 0.014936568554617643}. Best is trial 0 with value: 25.65356749955836.
    - Quantile 0.05: Finished. Best iter: [65]
    - Quantile 0.5: Finished. Best iter: [65]
    - Quantile 0.95: Finished. Best iter: [1]
    - Quantile 0.05: Finished. Best iter: [21]
    - Quantile 0.5: Finished. Best iter: [6]
    - Quantile 0.95: Finished. Best iter: [6]
[I 2025-10-13 08:16:12,149] Trial 1 finished with value: 25.65356749955836

[I 2025-10-13 08:17:49,774] A new study created in memory with name: no-name-813f8036-49fa-4a0c-aff4-311feb63b5e5


    - Quantile 0.95: Finished. Best iter: [6]
[I 2025-10-13 08:17:49,770] Trial 49 finished with value: 25.65356749955836 and parameters: {'num_leaves': 23, 'learning_rate': 0.07675021977087017, 'min_child_samples': 41, 'feature_fraction': 0.8155444268107479, 'bagging_fraction': 0.9173343013721267, 'reg_alpha': 0.011548875296540941, 'reg_lambda': 0.10614818269958574}. Best is trial 0 with value: 25.65356749955836.

--- 长白山 (603099.SH) 的 HPO 结果 ---
最佳分数 (ICIR): 25.6536
最佳参数组合:
  叶子节点数: 25
  学习率: 0.07969454818643935
  叶节点最小样本数: 47
  特征采样比例: 0.8394633936788146
  数据采样比例: 0.6624074561769746
  L1正则化: 0.029375384576328288
  L2正则化: 0.014936568554617643
INFO: 已从 L3 缓存中为 TCL科技 加载最后 2 个预处理 fold 用于 HPO。

--- 开始为 TCL科技 (000100.SZ) 进行 HPO (共 50 轮) ---


  0%|          | 0/50 [00:00<?, ?it/s]

    - Quantile 0.05: Finished. Best iter: [116]
    - Quantile 0.5: Finished. Best iter: [116]
    - Quantile 0.95: Finished. Best iter: [41]
    - Quantile 0.05: Finished. Best iter: [2]
    - Quantile 0.5: Finished. Best iter: [2]
    - Quantile 0.95: Finished. Best iter: [2]
[I 2025-10-13 08:17:52,180] Trial 0 finished with value: 6.095756196590693 and parameters: {'num_leaves': 25, 'learning_rate': 0.07969454818643935, 'min_child_samples': 47, 'feature_fraction': 0.8394633936788146, 'bagging_fraction': 0.6624074561769746, 'reg_alpha': 0.029375384576328288, 'reg_lambda': 0.014936568554617643}. Best is trial 0 with value: 6.095756196590693.
    - Quantile 0.05: Finished. Best iter: [116]
    - Quantile 0.5: Finished. Best iter: [93]
    - Quantile 0.95: Finished. Best iter: [41]
    - Quantile 0.05: Finished. Best iter: [2]
    - Quantile 0.5: Finished. Best iter: [2]
    - Quantile 0.95: Finished. Best iter: [2]
[I 2025-10-13 08:17:54,496] Trial 1 finished with value: 5.582482312858

### 2.3 模型训练

In [None]:
FORCE_RETRAIN = False
all_ic_history = []  # non-empty results returned by run_training_for_ticker

# Train every configured model type for every stock that has an entry in the
# L3 cache, collecting the per-run IC history for the evaluation cell.
print("--- 开始模型训练 ---\n")
if config and stocks_to_process:
    models_to_train = global_settings.get('models_to_train', ['lgbm', 'lstm'])
    stock_iterator = tqdm(stocks_to_process, desc="Processing Stocks")

    for stock_info in stock_iterator:
        ticker = stock_info.get('ticker')
        # Stocks without a ticker, or without cached data, are skipped silently.
        if not ticker or ticker not in global_data_cache:
            continue

        keyword = stock_info.get('keyword', ticker)
        stock_iterator.set_description(f"Processing {keyword}")

        # Everything needed for this stock comes from the cache.
        cached_stock_data = global_data_cache[ticker]
        full_df = cached_stock_data['full_df']

        for model_type in models_to_train:
            # Cached folds are stored under "<model_type>_folds".
            folds_key = f"{model_type}_folds"
            preprocessed_folds = cached_stock_data.get(folds_key)

            if not preprocessed_folds:
                print(f"\nWARNNING: 未找到 {keyword} 模型 '{model_type}' 的预处理 folds. 跳过.")
                continue

            # Per-run config restricted to the current stock; the full frame is
            # passed along under 'full_df_for_final_model'.
            run_config = {
                'global_settings': global_settings, 'strategy_config': strategy_config,
                'default_model_params': default_model_params, 'stocks_to_process': [stock_info],
                'full_df_for_final_model': full_df
            }

            ic_history = run_training_for_ticker(
                preprocessed_folds=preprocessed_folds,
                ticker=ticker,
                model_type=model_type,
                config=run_config,
                force_retrain=FORCE_RETRAIN,
                keyword=keyword
            )

            if ic_history is not None and not ic_history.empty:
                all_ic_history.append(ic_history)
else:
    print("ERROR: Config 或 stocks_to_process 为空.")

### 2.4 结果聚合、评估与可视化

In [None]:
# Aggregate the collected IC histories per (stock, model), show a summary
# table, and plot an ICIR comparison and the cumulative Rank IC curves.
print("\n--- 开始步骤 2.4: 结果聚合、评估与可视化 ---")
if all_ic_history:
    full_ic_df = pd.concat(all_ic_history)
    # Map tickers to display names; entries without a ticker are ignored
    # instead of raising KeyError.
    ticker_to_name = {
        s['ticker']: s.get('keyword', s['ticker'])
        for s in stocks_to_process if s.get('ticker')
    }
    full_ic_df['ticker_name'] = full_ic_df['ticker'].map(ticker_to_name)

    # Mean and standard deviation of rank IC per (stock, model).
    evaluation_summary = full_ic_df.groupby(['ticker_name', 'model_type'])['rank_ic'].agg(['mean', 'std']).reset_index()
    # ICIR = mean / std. A zero std is masked to NaN first so the ratio becomes
    # NaN rather than +/-inf, which would otherwise reach the colour gradient
    # and the bar chart below.
    ic_std = evaluation_summary['std']
    evaluation_summary['icir'] = evaluation_summary['mean'] / ic_std.where(ic_std != 0)

    # --- 1. Summary table ---
    print("\n--- 模型性能评估总结 ---")
    display(evaluation_summary.style.format({
        'mean': '{:.4f}', 'std': '{:.4f}', 'icir': '{:.4f}'
    }).background_gradient(cmap='viridis', subset=['icir']))

    # --- 2. ICIR comparison bar chart ---
    plt.figure(figsize=(12, 6))
    sns.barplot(data=evaluation_summary, x='ticker_name', y='icir', hue='model_type')
    plt.title('模型信息比率 (ICIR) 对比', fontsize=16)
    plt.xlabel('股票', fontsize=12)
    plt.ylabel('ICIR (信息比率)', fontsize=12)
    plt.axhline(0, color='grey', linestyle='--')
    plt.axhline(0.5, color='red', linestyle='--', label='ICIR=0.5 (良好)')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # --- 3. Cumulative Rank IC curves ---
    # Rows are sorted by date before the group-wise cumulative sum so each
    # curve accumulates in chronological order.
    plot_df = full_ic_df.copy()
    plot_df['date'] = pd.to_datetime(plot_df['date'])
    plot_df.sort_values('date', inplace=True)
    plot_df['cumulative_ic'] = plot_df.groupby(['ticker_name', 'model_type'])['rank_ic'].cumsum()

    plt.figure(figsize=(14, 8))
    sns.lineplot(data=plot_df, x='date', y='cumulative_ic', hue='ticker_name', style='model_type', marker='o', markersize=4, linestyle='--')
    plt.title('模型累积 Rank IC 曲线', fontsize=16)
    plt.xlabel('日期', fontsize=12)
    plt.ylabel('累积 Rank IC', fontsize=12)
    plt.legend(title='股票/模型')
    plt.tight_layout()
    plt.show()

else:
    print("\nWARNNING: 训练期间未生成 IC 历史。跳过汇总和评估.")