In [1]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime, timedelta
import glob
import multiprocessing 
from functools import partial
import time
import traceback

In [2]:
duckdb.query("SET worker_threads=1")

In [None]:
import os
import glob
import logging
import pandas as pd
import numpy as np
import duckdb
from datetime import datetime, timedelta

# 设置日志
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def calculate_minute_level_apb(trading_date):
    """计算分钟级APB因子"""

    trading_date = pd.to_datetime(trading_date)

    date_str = trading_date.strftime('%Y%m%d')
    
    order_pth = f"/data/cephfs/transaction/{date_str}.parquet"
    
    # 确保输出目录存在
    output_dir = "./factors/Transaction_minute_level_APB"
    os.makedirs(output_dir, exist_ok=True)

    output_file = f"{output_dir}/{date_str}.parquet"
    
    if not os.path.exists(order_pth):
        return
        # raise FileNotFoundError(f"订单数据文件不存在: {order_pth}")
    
    # 如果该日期的文件已存在，跳过处理
    if os.path.exists(output_file):
        # print(f"日期 {date_str} 已处理，跳过")
        return

    conn = duckdb.connect(database=':memory:')

    # 1. 读取毫秒级订单簿数据并转换为分钟级
    query_order = f"""
        WITH buy_orders_with_time AS(
            SELECT 
                security_code,
                order_side, 
                order_type,
                order_details,
                order_price,
                order_price_adj,
                order_volume,
                order_time,
                ('{date_str}'::DATE + MAKE_TIME(
                    FLOOR(order_time / 10000000)::int,
                    (FLOOR(order_time / 100000) % 100)::int,
                    (FLOOR(order_time / 1000) % 100)::numeric + 
                    (order_time % 1000)::numeric / 1000)
                )::TIMESTAMP AS formatted_time
            FROM '{order_pth}'
            WHERE order_side = 1
                AND order_type = 'A'
                AND order_details = 'L'
                AND order_price > 0
                AND order_volume > 0
                AND order_time >= 93000000
                AND order_time < 145700000
        )
        SELECT
            security_code,
            order_price,
            order_volume,
            DATE_TRUNC('minute', formatted_time) AS order_minute
        FROM buy_orders_with_time
        """

    # 将数据加载到临时表中
    conn.execute(f"CREATE TEMPORARY TABLE minute_order AS {query_order}")

    # 2. 计算每分钟的VWAP和TWAP
    minute_apb_query = """
        WITH minute_level_stats AS (
            -- 直接计算每分钟的VWAP和TWAP
            SELECT 
                security_code,
                order_minute,
                SUM(order_volume * order_price) / SUM(order_volume) AS vwap,
                AVG(order_price) AS twap,
                SUM(order_volume) AS total_volume
            FROM minute_order
            GROUP BY security_code, order_minute
        )
        -- 计算APB因子
        SELECT 
            '{date_str}' AS date_str,
            security_code,
            order_minute,
            vwap,
            twap,
            CASE 
                WHEN twap > 0 AND vwap > 0 THEN LN(twap / vwap)
                ELSE NULL
            END AS apb,
            (twap - vwap) / vwap AS apb_alt,
            total_volume
        FROM minute_level_stats
        WHERE total_volume > 0
        ORDER BY security_code
    """

    # 3. 保存分钟级数据
    minute_apb_df = conn.execute(minute_apb_query).fetchdf()
    
    # 保存分钟级数据
    minute_apb_df.to_parquet(output_file)
    
    return minute_apb_df

def process_all_trading_days(trading_dates):
    """处理所有交易日的数据"""
    for date in trading_dates:
        try:
            calculate_minute_level_apb(date)
            # print(f"成功处理日期: {date}")
        except Exception as e:
            print(f"处理日期 {date} 时出错: {str(e)}")



# def calculate_rolling_average(window_size=1200, daily_min=400,start_date=None, end_date=None):
#     """计算滚动平均APB因子，采用分批处理方式"""
#     data_dir = "./factors/minute_level_APB"  
#     parquet_files = sorted(glob.glob(os.path.join(data_dir, "*.parquet")))

#     if not parquet_files:
#         raise ValueError("未找到分钟级数据文件")

#     logger.info(f"找到 {len(parquet_files)} 个分钟级数据文件")
    
#     # 假设文件名是日期格式，如 "20210602.parquet"
#     # 提取文件名中的日期并排序
#     file_dates = []
#     for file_path in parquet_files:
#         file_name = os.path.basename(file_path)
#         date_str = file_name.split('.')[0]  # 假设文件名格式为 "YYYYMMDD.parquet"
#         file_dates.append((date_str, file_path))
    
#     # 按日期排序
#     file_dates.sort(key=lambda x: x[0])
    
#     # 如果指定了日期范围，筛选文件
#     if start_date and end_date:
#         file_dates = [(date, path) for date, path in file_dates 
#                       if start_date <= date <= end_date]
    
#     if not file_dates:
#         raise ValueError("筛选后没有符合条件的数据文件")
    
#     output_dir = "./factors/minute_level_APB/1200_Rolling"

#     # 确保输出目录存在
#     os.makedirs(output_dir, exist_ok=True)

#     for i in range(5,len(file_dates)):  # 从第6个文件开始，确保有前5天的数据
#         current_date, current_file = file_dates[i]
#         logger.info(f"处理日期 {current_date} 的数据")
        
#         # 检查输出文件是否已存在
#         output_file = os.path.join(output_dir, f"{current_date}.parquet")
#         if os.path.exists(output_file):
#             logger.info(f"日期 {current_date} 的结果文件已存在，跳过处理")
#             continue
        
#         # 获取当前日期和前5天的文件
#         start_idx = max(0, i-5)  # 确保不会索引到负数
#         recent_files = [path for _, path in file_dates[start_idx:i+1]]
        
#         # 读取这些天的数据
#         dfs = []
#         for file_path in recent_files:
#             try:
#                 df = pd.read_parquet(file_path)
#                 # 修复date_str列
#                 df['date_str'] = df['order_minute'].dt.strftime('%Y%m%d')
#                 # 添加分钟时间
#                 df['order_minute'] = df['order_minute'].dt.strftime('%H:%M')
#                 dfs.append(df)
#             except Exception as e:
#                 logger.error(f"读取文件 {file_path} 时出错: {str(e)}")
#                 continue
        
#         if not dfs:
#             logger.warning(f"日期 {current_date} 没有有效数据，跳过")
#             continue
        
#         # 合并数据
#         combined_data = pd.concat(dfs, ignore_index=True)
        
#         try:
#             # 只处理当前日期的结果
#             # 使用透视表方式计算滚动平均
#             # 先创建一个唯一的时间索引，确保数据按时间顺序排列
#             combined_data.sort_values(['security_code', 'date_str', 'order_minute'], inplace=True)
#             combined_data['time_idx'] = combined_data.groupby('security_code').cumcount()
            
#             # 为每个股票创建透视表
#             pivot_data = combined_data.pivot_table(
#                 index='time_idx', 
#                 columns='security_code', 
#                 values='apb',
#                 aggfunc='first'  # 如果有重复的time_idx，取第一个值
#             )
            
#             # 计算滚动平均
#             rolling_avg = pivot_data.rolling(window=window_size, min_periods=daily_min).mean()
            
#             # 将结果转回长格式
#             rolling_result = rolling_avg.stack().reset_index()
#             rolling_result.columns = ['time_idx', 'security_code', 'rolling_apb']
            
#             # 与原始数据合并，获取日期信息
#             rolling_result = pd.merge(
#                 rolling_result,
#                 combined_data[['security_code', 'time_idx', 'date_str', 'order_minute']],
#                 on=['security_code', 'time_idx'],
#                 how='left'
#             )
            
#             # 只保留当前日期的数据
#             current_date_result = rolling_result[rolling_result['date_str'] == current_date].copy()
            
#             # 计算每日平均值
#             daily_factor = current_date_result.groupby(['date_str', 'security_code'])['rolling_apb'].mean().reset_index()

#             daily_factor.columns = ['date', 'security_code', 'rolling_apb']
            
#             # 保存当天的结果到parquet文件
#             daily_factor.to_parquet(output_file)
#             logger.info(f"已保存日期 {current_date} 的结果到 {output_file}")
            
            
#             # 清理内存
#             del combined_data, pivot_data, rolling_avg, rolling_result, current_date_result, daily_factor
#             import gc
#             gc.collect()
            
#         except Exception as e:
#             logger.error(f"处理日期 {current_date} 时发生错误: {str(e)}")
#             import traceback
#             logger.error(traceback.format_exc())
    
    # return output_dir



In [11]:
import concurrent.futures
import os
import glob
import pandas as pd
import logging
import traceback
import gc
from functools import partial

def process_single_date(date_info, window_size=1200, min_periods=400, output_dir="./factors/minute_level_APB/1200_Rolling"):
    """处理单个日期的数据"""
    current_date, current_file, recent_files = date_info
    
    # 设置每个线程的logger
    thread_logger = logging.getLogger(f"thread_{current_date}")
    if not thread_logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        thread_logger.addHandler(handler)
        thread_logger.setLevel(logging.INFO)
    
    thread_logger.info(f"开始处理日期 {current_date} 的数据")
    
    # 检查输出文件是否已存在
    output_file = os.path.join(output_dir, f"{current_date}.parquet")
    if os.path.exists(output_file):
        thread_logger.info(f"日期 {current_date} 的结果文件已存在，跳过处理")
        return current_date, True, "文件已存在"
    
    try:
        # 读取这些天的数据
        dfs = []
        for file_path in recent_files:
            try:
                df = pd.read_parquet(file_path)
                # 修复date_str列
                df['date_str'] = df['order_minute'].dt.strftime('%Y%m%d')
                # 添加分钟时间
                df['order_minute'] = df['order_minute'].dt.strftime('%H:%M')
                dfs.append(df)
            except Exception as e:
                thread_logger.error(f"读取文件 {file_path} 时出错: {str(e)}")
                continue
        
        if not dfs:
            thread_logger.warning(f"日期 {current_date} 没有有效数据，跳过")
            return current_date, False, "没有有效数据"
        
        # 合并数据
        combined_data = pd.concat(dfs, ignore_index=True)
        
        # 使用透视表方式计算滚动平均
        # 先创建一个唯一的时间索引，确保数据按时间顺序排列
        combined_data.sort_values(['security_code', 'date_str', 'order_minute'], inplace=True)
        combined_data['time_idx'] = combined_data.groupby('security_code').cumcount()
        
        # 为每个股票创建透视表
        pivot_data = combined_data.pivot_table(
            index='time_idx', 
            columns='security_code', 
            values='apb',
            aggfunc='first'  # 如果有重复的time_idx，取第一个值
        )
        
        # 计算滚动平均
        rolling_avg = pivot_data.rolling(window=window_size, min_periods=min_periods).mean()
        
        # 将结果转回长格式
        rolling_result = rolling_avg.stack().reset_index()
        rolling_result.columns = ['time_idx', 'security_code', 'rolling_apb']
        
        # 与原始数据合并，获取日期信息
        rolling_result = pd.merge(
            rolling_result,
            combined_data[['security_code', 'time_idx', 'date_str', 'order_minute']],
            on=['security_code', 'time_idx'],
            how='left'
        )
        
        # 只保留当前日期的数据
        current_date_result = rolling_result[rolling_result['date_str'] == current_date].copy()
        
        # 计算每日平均值
        daily_factor = current_date_result.groupby(['date_str', 'security_code'])['rolling_apb'].mean().reset_index()
        daily_factor.columns = ['date', 'security_code', 'rolling_apb']
        
        # 保存当天的结果到parquet文件
        daily_factor.to_parquet(output_file)
        thread_logger.info(f"已保存日期 {current_date} 的结果到 {output_file}")
        
        # 清理内存
        del combined_data, pivot_data, rolling_avg, rolling_result, current_date_result, daily_factor
        gc.collect()
        
        return current_date, True, f"处理成功，结果已保存到 {output_file}"
        
    except Exception as e:
        error_msg = f"处理日期 {current_date} 时发生错误: {str(e)}\n{traceback.format_exc()}"
        thread_logger.error(error_msg)
        return current_date, False, error_msg

def calculate_rolling_average(window_size=1200, daily_min=400, start_date=None, end_date=None, max_workers=4):
    """计算滚动平均APB因子，采用多线程分批处理方式"""
    # 设置主logger
    logger = logging.getLogger("main")
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    
    data_dir = "./factors/minute_level_APB"  
    parquet_files = sorted(glob.glob(os.path.join(data_dir, "*.parquet")))

    if not parquet_files:
        raise ValueError("未找到分钟级数据文件")

    logger.info(f"找到 {len(parquet_files)} 个分钟级数据文件")
    
    # 假设文件名是日期格式，如 "20210602.parquet"
    # 提取文件名中的日期并排序
    file_dates = []
    for file_path in parquet_files:
        file_name = os.path.basename(file_path)
        date_str = file_name.split('.')[0]  # 假设文件名格式为 "YYYYMMDD.parquet"
        file_dates.append((date_str, file_path))
    
    # 按日期排序
    file_dates.sort(key=lambda x: x[0])
    
    # 如果指定了日期范围，筛选文件
    if start_date and end_date:
        file_dates = [(date, path) for date, path in file_dates 
                      if start_date <= date <= end_date]
    
    if not file_dates:
        raise ValueError("筛选后没有符合条件的数据文件")
    
    output_dir = "./factors/minute_level_APB/1200_Rolling"

    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)

    # 准备任务列表
    tasks = []
    for i in range(5, len(file_dates)):  # 从第6个文件开始，确保有前5天的数据
        current_date, current_file = file_dates[i]
        
        # 检查输出文件是否已存在（快速检查，避免创建不必要的任务）
        output_file = os.path.join(output_dir, f"{current_date}.parquet")
        if os.path.exists(output_file):
            logger.info(f"日期 {current_date} 的结果文件已存在，跳过处理")
            continue
            
        # 获取当前日期和前5天的文件
        start_idx = max(0, i-5)  # 确保不会索引到负数
        recent_files = [path for _, path in file_dates[start_idx:i+1]]
        
        tasks.append((current_date, current_file, recent_files))
    
    if not tasks:
        logger.info("没有需要处理的新数据")
        return output_dir
    
    logger.info(f"准备处理 {len(tasks)} 个日期的数据")
    
    # 创建处理函数的偏函数，固定window_size和min_periods参数
    process_func = partial(process_single_date, window_size=window_size, min_periods=daily_min, output_dir=output_dir)
    
    # 使用线程池并行处理
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_date = {executor.submit(process_func, task): task[0] for task in tasks}
        for future in concurrent.futures.as_completed(future_to_date):
            date = future_to_date[future]
            try:
                result = future.result()
                results.append(result)
                logger.info(f"日期 {date} 处理完成: {result[2]}")
            except Exception as exc:
                logger.error(f"日期 {date} 处理失败: {str(exc)}")
    
    # 汇总处理结果
    success_count = sum(1 for _, success, _ in results if success)
    logger.info(f"处理完成。成功: {success_count}/{len(tasks)}")
    
    return output_dir


In [12]:
def calculate_rolling_20d_avg(factor_names, factor_dir, min_window=5):
    
    if not isinstance(factor_names, list):
        factor_names = [factor_names]
    
    parquet_pattern = os.path.join(factor_dir, "*.parquet")
    parquet_files = glob.glob(parquet_pattern)
    print(f"在目录 {factor_dir} 中找到 {len(parquet_files)} 个Parquet文件")
    

    conn = duckdb.connect(database=':memory:')
    all_factor_data = conn.execute(f"""
        SELECT * FROM read_parquet('{parquet_pattern}')
    """).fetchdf()
    conn.close()
    

    missing_factors = [f for f in factor_names if f not in all_factor_data.columns]
    if missing_factors:
        raise ValueError(f"在Parquet文件中未找到以下因子列: {', '.join(missing_factors)}")
    
    all_factor_data['date'] = pd.to_datetime(all_factor_data['date'])

    all_factor_data = all_factor_data.sort_values(['date', 'security_code'])


    # 获取所有个股代码
    all_securities = all_factor_data['security_code'].unique()

    result_df = all_factor_data[['date', 'security_code']].copy()

    # 为每个因子计算滚动平均
    for factor_name in factor_names:
        print(f"\n处理因子: {factor_name}")
        
        #转置成宽表
        pivot_data = all_factor_data.pivot(index='date', columns='security_code', values=factor_name)

        # 对宽表直接应用rolling
        rolling_avg = pivot_data.rolling(window=20, min_periods=min_window).mean()

        # 将结果转换回长格式
        factor_df = rolling_avg.stack().reset_index()
        factor_df.columns = ['date', 'security_code', factor_name]
        factor_df[factor_name] = - factor_df[factor_name]

        # 计算每日因子覆盖率
        # 计算每个日期非NaN的因子值数量
        non_nan_counts = factor_df.dropna(subset=[factor_name]).groupby('date').size()

        # 计算覆盖率
        coverage = non_nan_counts / len(all_securities)
        print(f"\n{factor_name}因子覆盖率统计: 平均={coverage.mean():.2f}, 最小={coverage.min():.2f}")

        result_df = pd.merge(
            result_df, 
            factor_df,
            on=['date', 'security_code'],
            how='left'
        )
    
    return result_df

In [13]:
def main():
    # 获取所有交易日期
    start_date = '2019-01-01'
    end_date = '2025-05-31'

    trading_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    
    # 1. 计算所有交易日的分钟级APB
    process_all_trading_days(trading_dates)
    
    # # 2. 计算滚动平均和每日因子值
    # rolling_df_dir = calculate_rolling_average(window_size=1200,daily_min=400)

    # factor_name ='rolling_apb'
    # daily_factor = calculate_rolling_20d_avg(factor_name,rolling_df_dir)
    
    # # 保存结果为CSV
    # output_path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_APB"
    # os.makedirs(output_path, exist_ok=True)
    # output_file_path = f"{output_path}/MinuteRoll_Raw_Order_APB.csv"
    # daily_factor.to_csv(output_file_path, index=False)
    
    # print("APB因子计算完成")
    # print(daily_factor.head())

if __name__ == "__main__":
    main()


In [14]:
# 查看结果统计
output_path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_APB"
order_pth = f"{output_path}/MinuteRoll_Raw_Order_APB.csv"
factor_df = pd.read_csv(f'{order_pth}')
factor_name = 'rolling_apb'
factor_df[factor_name] = - factor_df[factor_name]

output_path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_APB"
os.makedirs(output_path, exist_ok=True)
output_file_path = f"{output_path}/MinuteRoll_Raw_Order_APB.csv"
factor_df.to_csv(output_file_path,index=False)

KeyboardInterrupt: 

In [None]:
import logging

# 设置日志
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def calculate_minute_level_apb(trading_date):
    """计算分钟级APB因子"""
    try:
        # 尝试在函数内部导入pandas，避免全局导入
        import pandas as pd
        
        # 如果传入的是字符串，转换为datetime对象
        if isinstance(trading_date, str):
            trading_date = pd.to_datetime(trading_date)
        
        date_str = trading_date.strftime('%Y%m%d')
        
        order_pth = f"/data/cephfs/order/{date_str}.parquet"
        
        # 确保输出目录存在
        output_dir = "./factors/Ask_minute_level_APB"
        os.makedirs(output_dir, exist_ok=True)

        output_file = f"{output_dir}/{date_str}.parquet"
        
        if not os.path.exists(order_pth):
            # logger.info(f"订单数据文件不存在: {order_pth}")
            return
        
        # 如果该日期的文件已存在，跳过处理
        if os.path.exists(output_file):
            logger.info(f"日期 {date_str} 已处理，跳过")
            return

        conn = duckdb.connect(database=':memory:')

        # 1. 读取毫秒级订单簿数据并转换为分钟级
        query_order = f"""
            WITH buy_orders_with_time AS(
                SELECT 
                    security_code,
                    order_side, 
                    order_type,
                    order_details,
                    order_price,
                    order_price_adj,
                    order_volume,
                    order_time,
                    ('{trading_date}'::DATE + MAKE_TIME(
                        FLOOR(order_time / 10000000)::int,
                        (FLOOR(order_time / 100000) % 100)::int,
                        (FLOOR(order_time / 1000) % 100)::numeric + 
                        (order_time % 1000)::numeric / 1000)
                    )::TIMESTAMP AS formatted_time
                FROM '{order_pth}'
                WHERE order_side = 1
                    AND order_type = 'A'
                    AND order_details = 'L'
                    AND order_time >= 93000000
                    AND order_time < 145700000
            )
            SELECT
                security_code,
                order_price,
                order_volume,
                DATE_TRUNC('minute', formatted_time) AS order_minute
            FROM buy_orders_with_time
            """

        # 将数据加载到临时表中
        conn.execute(f"CREATE TEMPORARY TABLE minute_order AS {query_order}")

        # 2. 计算每分钟的VWAP和TWAP
        minute_apb_query = f"""
            WITH minute_level_stats AS (
                -- 直接计算每分钟的VWAP和TWAP
                SELECT 
                    security_code,
                    order_minute,
                    SUM(order_volume * order_price) / SUM(order_volume) AS vwap,
                    AVG(order_price) AS twap,
                    SUM(order_volume) AS total_volume
                FROM minute_order
                GROUP BY security_code, order_minute
            )
            -- 计算APB因子
            SELECT 
                '{date_str}' AS date,
                security_code,
                order_minute,
                vwap,
                twap,
                CASE 
                    WHEN twap > 0 AND vwap > 0 THEN LN(twap / vwap)
                    ELSE NULL
                END AS apb,
                (twap - vwap) / vwap AS apb_alt,
                total_volume
            FROM minute_level_stats
            WHERE total_volume > 0
            ORDER BY security_code
        """

        # 3. 保存分钟级数据
        minute_apb_df = conn.execute(minute_apb_query).fetchdf()
        
        # 保存分钟级数据
        minute_apb_df.to_parquet(output_file)
        
        logger.info(f"成功处理日期: {date_str}")
        return True
    except Exception as e:
        logger.error(f"处理日期 {trading_date} 时出错: {str(e)}")
        logger.error(traceback.format_exc())
        return False


def worker_process(date):
    """工作进程函数"""
    try:
        result = calculate_minute_level_apb(date)
        return (date, result, None)
    except Exception as e:
        return (date, False, str(e))


def process_all_trading_days(trading_dates, num_processes=None):
    """使用多进程处理所有交易日的数据
    
    Args:
        trading_dates: 要处理的交易日列表
        num_processes: 进程数，默认为CPU核心数
    """
    if num_processes is None:
        num_processes = multiprocessing.cpu_count()
    
    # 确保输出目录存在
    output_dir = "./factors/Percent_Bid_minute_level_APB"
    os.makedirs(output_dir, exist_ok=True)
    
    # 过滤掉已经处理过的日期
    dates_to_process = []
    for date in trading_dates:
        # 如果是字符串格式，转换为datetime对象
        if isinstance(date, str):
            try:
                date_obj = datetime.strptime(date, '%Y-%m-%d')
            except ValueError:
                try:
                    date_obj = datetime.strptime(date, '%Y%m%d')
                except ValueError:
                    logger.error(f"无法解析日期格式: {date}")
                    continue
        else:
            date_obj = date
            
        date_str = date_obj.strftime('%Y%m%d')
        output_file = f"{output_dir}/{date_str}.parquet"
        
        if not os.path.exists(output_file):
            dates_to_process.append(date)
    
    total_dates = len(dates_to_process)
    if total_dates == 0:
        logger.info("所有日期已处理完毕，无需再次处理")
        return
    
    logger.info(f"开始处理 {total_dates} 个交易日，使用 {num_processes} 个进程")
    
    # 创建进程池
    pool = multiprocessing.Pool(processes=num_processes)
    
    # 提交所有任务
    results = []
    for date in dates_to_process:
        result = pool.apply_async(worker_process, (date,))
        results.append(result)
    
    # 关闭进程池，不再接受新任务
    pool.close()
    
    # 显示进度
    completed = 0
    while completed < total_dates:
        completed = sum(1 for r in results if r.ready())
        logger.info(f"进度: {completed}/{total_dates}")
        time.sleep(10)  # 每10秒更新一次进度
    
    # 等待所有进程完成
    pool.join()
    
    # 处理结果
    success_count = 0
    for result in results:
        date, success, error = result.get()
        if success:
            success_count += 1
        else:
            logger.error(f"处理日期 {date} 时出错: {error}")
    
    logger.info(f"处理完成: 成功 {success_count}/{total_dates}")


# 使用示例
if __name__ == "__main__":

    start_date = '2019-01-01'
    end_date = '2025-05-31'

    trading_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    
    process_all_trading_days(trading_dates, num_processes=20)


2025-06-30 13:21:22,434 - __main__ - INFO - 开始处理 790 个交易日，使用 20 个进程


2025-06-30 13:21:22,651 - __main__ - INFO - 进度: 0/790
2025-06-30 13:21:32,658 - __main__ - INFO - 进度: 790/790
2025-06-30 13:21:42,660 - __main__ - ERROR - 处理日期 2019-01-01 00:00:00 时出错: None
2025-06-30 13:21:42,661 - __main__ - ERROR - 处理日期 2019-01-05 00:00:00 时出错: None
2025-06-30 13:21:42,662 - __main__ - ERROR - 处理日期 2019-01-06 00:00:00 时出错: None
2025-06-30 13:21:42,664 - __main__ - ERROR - 处理日期 2019-01-12 00:00:00 时出错: None
2025-06-30 13:21:42,665 - __main__ - ERROR - 处理日期 2019-01-13 00:00:00 时出错: None
2025-06-30 13:21:42,666 - __main__ - ERROR - 处理日期 2019-01-19 00:00:00 时出错: None
2025-06-30 13:21:42,667 - __main__ - ERROR - 处理日期 2019-01-20 00:00:00 时出错: None
2025-06-30 13:21:42,668 - __main__ - ERROR - 处理日期 2019-01-26 00:00:00 时出错: None
2025-06-30 13:21:42,668 - __main__ - ERROR - 处理日期 2019-01-27 00:00:00 时出错: None
2025-06-30 13:21:42,669 - __main__ - ERROR - 处理日期 2019-02-02 00:00:00 时出错: None
2025-06-30 13:21:42,670 - __main__ - ERROR - 处理日期 2019-02-03 00:00:00 时出错: None
2025-06-30