In [1]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime
import glob
import multiprocessing as mp
from functools import partial
import time

In [2]:
def _winsorized_apb_sql(lower_col, upper_col, factor_col):
    """Build the query that winsorizes order prices and derives one APB column.

    The query reads the ``minute_order`` and ``bid_percentiles`` temporary
    tables, clamps each order price into ``[lower_col, upper_col]`` (columns of
    ``bid_percentiles``), aggregates per security and returns
    ``security_code`` plus ``factor_col``.
    """
    return f"""
    WITH winsorized AS (
        SELECT
            b.security_code,
            CASE
                WHEN b.order_price < p.{lower_col} THEN p.{lower_col}
                WHEN b.order_price > p.{upper_col} THEN p.{upper_col}
                ELSE b.order_price
            END AS order_price,
            b.order_volume
        FROM minute_order b
        JOIN bid_percentiles p ON b.security_code = p.security_code
    ),
    order_value AS (
        SELECT
            security_code,
            AVG(order_price) AS twap,
            SUM(order_volume * order_price) / SUM(order_volume) AS vwap
        FROM winsorized
        GROUP BY security_code
    )
    SELECT
        security_code,
        CASE
            WHEN twap > 0 AND vwap > 0 THEN LN(twap / vwap)
            ELSE NULL
        END AS {factor_col}
    FROM order_value
    """


def calculate_daily_factor(trading_date):
    """Compute the winsorized daily APB factors for one trading date.

    Reads ``/data/cephfs/order/<YYYYMMDD>.parquet`` and keeps rows with
    ``order_side = 1``, ``order_type = 'A'``, ``order_details = 'L'``, positive
    price and volume, and ``93000000 <= order_time < 145700000``.
    For every security the order prices are clamped to their own
    1%/99% and 5%/95% percentiles, and for each clamped series the factor is
    ``ln(mean price / volume-weighted mean price)``.

    NOTE(review): the original naming treats ``order_side = 1`` as the buy
    side; the meaning of the ``'A'`` / ``'L'`` codes and the ``order_time``
    encoding (presumably HHMMSSmmm) is not visible here - confirm against the
    order-file schema.

    Args:
        trading_date: Anything ``pd.to_datetime`` accepts.

    Returns:
        DataFrame with columns ``security_code``, ``daily_APB_p01_p99`` and
        ``daily_APB_p05_p95`` (one row per security).

    Raises:
        FileNotFoundError: If the order file for that date does not exist.
    """
    order_pth = f"/data/cephfs/order/{pd.to_datetime(trading_date).strftime('%Y%m%d')}.parquet"

    if not os.path.exists(order_pth):
        raise FileNotFoundError(f"订单数据文件不存在: {order_pth}")

    conn = duckdb.connect(database=':memory:')
    try:
        # Load only the columns the factor needs into a temporary table.
        conn.execute(f"""
            CREATE TEMPORARY TABLE minute_order AS
            SELECT
                security_code,
                order_price,
                order_volume,
                order_time
            FROM '{order_pth}'
            WHERE order_side = 1
                AND order_type = 'A'
                AND order_details = 'L'
                AND order_price > 0
                AND order_volume > 0
                AND order_time >= 93000000
                AND order_time < 145700000
        """)

        # Per-security price percentiles used as winsorization bounds.
        conn.execute("""
            CREATE TEMPORARY TABLE bid_percentiles AS
            SELECT
                security_code,
                PERCENTILE_CONT(0.01) WITHIN GROUP (ORDER BY order_price) AS p01,
                PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY order_price) AS p99,
                PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY order_price) AS p05,
                PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY order_price) AS p95
            FROM minute_order
            GROUP BY security_code
        """)

        p01_p99_factors = conn.execute(
            _winsorized_apb_sql('p01', 'p99', 'daily_APB_p01_p99')
        ).fetchdf()
        p05_p95_factors = conn.execute(
            _winsorized_apb_sql('p05', 'p95', 'daily_APB_p05_p95')
        ).fetchdf()
    finally:
        # Release the in-memory database even when a query fails; this function
        # is called once per date, so a leaked connection adds up quickly.
        conn.close()

    return pd.merge(
        p01_p99_factors,
        p05_p95_factors,
        on='security_code',
        how='inner'
    )




In [6]:
def derive_daily_factor(start_date, end_date, factor_name):
    """Compute and store the daily factor file for each business day, one by one.

    NOTE(review): a later cell in this notebook defines another
    ``derive_daily_factor`` (multiprocessing based). When the cells run top to
    bottom that later definition replaces this one.

    For every date in ``pd.date_range(start_date, end_date, freq='B')`` the
    function writes ``./factors/<factor_name>/<YYYYMMDD>.parquet``. Dates whose
    output already exists are counted as processed without recomputation;
    dates without an order file, or whose computation raises, are counted as
    skipped. ``freq='B'`` yields Monday-Friday dates, so exchange holidays show
    up as missing order files.

    Args:
        start_date: First date of the range (anything ``pd.date_range`` accepts).
        end_date: Last date of the range, inclusive.
        factor_name: Name of the sub-directory under ``./factors``.

    Returns:
        The output directory path, ``./factors/<factor_name>``.
    """
    # Candidate trading dates.
    trading_dates = pd.date_range(start=start_date, end=end_date, freq='B')
    output_dir = f"./factors/{factor_name}"
    os.makedirs(output_dir, exist_ok=True)
    # Bookkeeping for the final summary.
    processed_dates = []
    skipped_dates = []

    for trading_date in trading_dates:
        date_str = trading_date.strftime('%Y%m%d')
        file_path = f"{output_dir}/{date_str}.parquet"

        # Existing output: nothing to do.
        if os.path.exists(file_path):
            print(f"文件已存在，跳过: {file_path}")
            processed_dates.append(date_str)
            continue

        # No raw order file for this date.
        order_pth = f"/data/cephfs/order/{date_str}.parquet"

        if not os.path.exists(order_pth):
            print(f"跳过日期 {date_str}: 订单数据文件不存在")
            skipped_dates.append(date_str)
            continue

        # Write to a temporary name first and rename afterwards: a write that
        # is interrupted must not leave a truncated "<date>.parquet" behind,
        # because the existence check above would then skip it on every rerun.
        tmp_path = f"{file_path}.tmp"
        try:
            daily_factor_df = calculate_daily_factor(trading_date)

            # Tag every row with its date.
            daily_factor_df['date'] = date_str

            daily_factor_df.to_parquet(tmp_path, index=False)
            os.replace(tmp_path, file_path)
            print(f"已生成因子文件: {file_path}")
            processed_dates.append(date_str)

        except Exception as e:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written, or cleanup is best-effort
            print(f"处理日期 {date_str} 时出错: {str(e)}")
            skipped_dates.append(date_str)

    # Summary.
    print(f"\n处理完成:")
    print(f"成功处理 {len(processed_dates)} 个日期")
    print(f"跳过 {len(skipped_dates)} 个日期")

    return output_dir

In [3]:
def calculate_rolling_20d_avg(factor_names, factor_dir, start_date=None, end_date=None, min_window=5, csv_dir=None):
    """Compute per-security rolling means of daily factors and export them.

    All ``*.parquet`` files in ``factor_dir`` are loaded, optionally restricted
    to ``[start_date, end_date]``, pivoted to a date x security table per
    factor and smoothed with ``rolling(window=20, min_periods=min_window)``.
    The window counts rows of that table (dates present in the data), and the
    date filter is applied before rolling, so the first dates of the range
    have fewer than 20 observations behind them.

    Side effects: one CSV per factor is written to
    ``<csv_dir>/D_D_Order_Bid_20d_<factor>.csv`` and coverage statistics are
    printed.

    Args:
        factor_names: A factor column name or a list of them.
        factor_dir: Directory containing the daily factor parquet files.
        start_date: Optional inclusive lower bound on ``date``.
        end_date: Optional inclusive upper bound on ``date``.
        min_window: ``min_periods`` passed to the rolling mean.
        csv_dir: Directory for the per-factor CSV files. ``None`` keeps the
            location this notebook has always used.

    Returns:
        DataFrame with ``date``, ``security_code`` and one rolling-mean column
        per requested factor (left-merged onto the filtered input rows).

    Raises:
        FileNotFoundError: If ``factor_dir`` contains no parquet file.
        ValueError: If a requested factor column is missing (pandas also
            raises ValueError from ``pivot`` on duplicate date/security pairs).
    """
    if not isinstance(factor_names, list):
        factor_names = [factor_names]

    if csv_dir is None:
        csv_dir = "/data/home/lexuanchen/Factors/Order/Signal/Alltime_Improved_APB_2"

    parquet_pattern = os.path.join(factor_dir, "*.parquet")
    parquet_files = glob.glob(parquet_pattern)
    print(f"在目录 {factor_dir} 中找到 {len(parquet_files)} 个Parquet文件")
    if not parquet_files:
        # Fail early with a clear message instead of an engine-level error.
        raise FileNotFoundError(f"未找到因子Parquet文件: {parquet_pattern}")

    conn = duckdb.connect(database=':memory:')
    try:
        all_factor_data = conn.execute(f"""
            SELECT * FROM read_parquet('{parquet_pattern}')
        """).fetchdf()
    finally:
        conn.close()

    # Presumably older factor files carry merge-suffixed column names
    # (_x / _y); map them to the current names. No-op when they are absent.
    all_factor_data = all_factor_data.rename(columns={
        'daily_APB_p01_p99_x': 'daily_APB_p01_p99',
        'daily_APB_p01_p99_y': 'daily_APB_p05_p95',
    })

    missing_factors = [f for f in factor_names if f not in all_factor_data.columns]
    if missing_factors:
        raise ValueError(f"在Parquet文件中未找到以下因子列: {', '.join(missing_factors)}")

    all_factor_data['date'] = pd.to_datetime(all_factor_data['date'])

    all_factor_data = all_factor_data.sort_values(['date', 'security_code'])

    # Restrict to the requested date range.
    if start_date:
        start_date = pd.to_datetime(start_date)
        all_factor_data = all_factor_data[all_factor_data['date'] >= start_date]
    if end_date:
        end_date = pd.to_datetime(end_date)
        all_factor_data = all_factor_data[all_factor_data['date'] <= end_date]

    # Universe of securities, used as the coverage denominator.
    all_securities = all_factor_data['security_code'].unique()

    result_df = all_factor_data[['date', 'security_code']].copy()

    # The CSV export below must not depend on the caller having created the
    # directory beforehand.
    os.makedirs(csv_dir, exist_ok=True)

    # Rolling mean for each requested factor.
    for factor_name in factor_names:
        print(f"\n处理因子: {factor_name}")

        # Wide table: one row per date, one column per security.
        pivot_data = all_factor_data.pivot(index='date', columns='security_code', values=factor_name)

        rolling_avg = pivot_data.rolling(window=20, min_periods=min_window).mean()

        # Back to long format.
        factor_df = rolling_avg.stack().reset_index()
        factor_df.columns = ['date', 'security_code', factor_name]

        factor_df.to_csv(f"{csv_dir}/D_D_Order_Bid_20d_{factor_name}.csv")

        # Daily coverage: securities with a value / all securities in range.
        non_nan_counts = factor_df.dropna(subset=[factor_name]).groupby('date').size()

        coverage = non_nan_counts / len(all_securities)
        print(f"\n{factor_name}因子覆盖率统计: 平均={coverage.mean():.2f}, 最小={coverage.min():.2f}")

        result_df = pd.merge(
            result_df,
            factor_df,
            on=['date', 'security_code'],
            how='left'
        )

    return result_df

In [4]:
def process_single_date(trading_date, factor_name):
    """Compute and store the factor file for a single date (pool worker).

    Args:
        trading_date: Object with ``strftime`` (the caller passes pandas
            Timestamps).
        factor_name: Name of the sub-directory under ``./factors``.

    Returns:
        Tuple ``(date_str, success, message)``. ``success`` is True when the
        output already exists or was written, False when the raw order file is
        missing or the computation raised; errors are reported through the
        tuple instead of being re-raised.
    """
    date_str = trading_date.strftime('%Y%m%d')
    output_dir = f"./factors/{factor_name}"
    file_path = f"{output_dir}/{date_str}.parquet"

    # Existing output: nothing to do.
    if os.path.exists(file_path):
        print(f"文件已存在，跳过: {file_path}")
        return date_str, True, "已存在"

    # No raw order file for this date.
    data_file = f"/data/cephfs/order/{date_str}.parquet"
    if not os.path.exists(data_file):
        print(f"跳过日期 {date_str}: 原始数据文件不存在")
        return date_str, False, "数据文件不存在"

    # Write under a temporary name and rename afterwards. A worker that dies
    # mid-write must not leave a truncated "<date>.parquet" behind, because
    # the existence check above would skip it on every later run. The ".tmp"
    # suffix also keeps it out of "*.parquet" globs.
    tmp_path = f"{file_path}.tmp"
    try:
        daily_factor_df = calculate_daily_factor(trading_date)

        daily_factor_df['date'] = date_str

        # Do not rely on the parent process having created the directory.
        os.makedirs(output_dir, exist_ok=True)
        daily_factor_df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, file_path)
        print(f"已生成因子文件: {file_path}")
        return date_str, True, "成功"

    except Exception as e:
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was written, or cleanup is best-effort
        error_msg = str(e)
        print(f"处理日期 {date_str} 时出错: {error_msg}")
        return date_str, False, error_msg


def derive_daily_factor(start_date, end_date, factor_name, num_processes=30):
    """Compute the daily factor files for a date range using a process pool.

    Every date in ``pd.date_range(start_date, end_date, freq='B')`` is handed
    to ``process_single_date``; results are summarized on stdout.

    Args:
        start_date: First date of the range (anything ``pd.date_range`` accepts).
        end_date: Last date of the range, inclusive.
        factor_name: Name of the sub-directory under ``./factors``.
        num_processes: Number of worker processes in the pool.

    Returns:
        The output directory path, ``./factors/<factor_name>``.
    """
    trading_dates = pd.date_range(start=start_date, end=end_date, freq='B')

    # Make sure the output directory exists before the workers start.
    output_dir = f"./factors/{factor_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Worker callable with the factor name bound.
    process_date_with_args = partial(process_single_date, factor_name=factor_name)

    # The context manager tears the pool down even if map() raises or the
    # cell is interrupted, so no worker processes are left behind.
    with mp.Pool(processes=num_processes) as pool:
        print(f"开始使用 {num_processes} 个进程并行处理 {len(trading_dates)} 个交易日...")
        results = pool.map(process_date_with_args, trading_dates)

    # Split the per-date results by outcome.
    processed_dates = [date_str for date_str, success, _ in results if success]
    skipped_dates = [date_str for date_str, success, _ in results if not success]

    # Summary.
    print(f"\n处理完成:")
    print(f"成功处理 {len(processed_dates)} 个日期")
    print(f"跳过 {len(skipped_dates)} 个日期")

    return output_dir


In [None]:
def main():
    """Generate the daily factor files, smooth them and export the result CSV.

    Steps: build the per-date parquet files for the configured date range,
    compute the rolling means of both winsorized APB factors, and write the
    combined table to ``D_D_Order_Bid_Percent_20d_APB.csv`` in the signal
    directory.
    """
    start_date = '2016-06-20'
    end_date = '2025-06-30'

    factor_name = "Percent_D_D_Bid_daily_APB"

    # Use the directory that was actually written to (resolved against the
    # current working directory) rather than a machine-specific absolute path.
    factor_dir = os.path.abspath(derive_daily_factor(start_date, end_date, factor_name))

    print(f"所有因子文件已生成在目录: {factor_dir}")

    factor_names = ['daily_APB_p01_p99', 'daily_APB_p05_p95']

    # Create the signal directory first: calculate_rolling_20d_avg writes its
    # per-factor CSV files into this same location.
    output_path = "/data/home/lexuanchen/Factors/Order/Signal/Alltime_Improved_APB_2"
    os.makedirs(output_path, exist_ok=True)

    result_df = calculate_rolling_20d_avg(factor_names, factor_dir, start_date, end_date, min_window=5)

    # Save the combined result as CSV.
    result_df.to_csv(f'{output_path}/D_D_Order_Bid_Percent_20d_APB.csv')

    print(f"共计 {len(result_df)} 条记录")

    print("\n数据预览:")
    print(result_df.head())

# Run the pipeline when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20160620.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20160718.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20160815.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20160912.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20161010.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20161107.parquet
跳过日期 20170102: 原始数据文件不存在文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20170227.parquet
文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20161205.parquet跳过日期 20170130: 原始数据文件不存在
文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20170424.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20170327.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20160621.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20170619.parquet

文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20170717.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20160719.parquet文件已存在，跳过: ./factors/Percent_D_D_Bid_daily_APB/20170522.parquet


文件已存在，跳过: ./fa

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20210316.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220118.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220824.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20210317.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20210318.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220119.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220825.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20210319.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220826.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220120.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220829.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220121.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220830.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220831.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220901.parquet
已生成因子文件: ./factors/Percent_D_D_Bid_daily_APB/20220902.parquet

处理完成:
成功处理 2193 个日期
跳过 163 个日期
所有因子文件已生成在目录: /data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Percent_D_D_Bid_daily_APB
在目录 /data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Percent_D_D_Bid_daily_APB 中找到 2193 个Parquet文件


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


处理因子: daily_APB_p01_p99

daily_APB_p01_p99因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: daily_APB_p05_p95

daily_APB_p05_p95因子覆盖率统计: 平均=0.77, 最小=0.47


NameError: name '保存结果为CSV' is not defined

In [None]:
# Rename the factor column of the exported ask-side CSV in place.
output_path = "/data/home/lexuanchen/Factors/Order/Signal/Alltime_Improved_APB"
csv_path = f'{output_path}/D_D_Ask_daily_APB.csv'
df = pd.read_csv(csv_path).rename(columns={'daily_APB': 'D_D_Ask_daily_APB'})
# index=False keeps exactly the columns that were read. Writing the default
# RangeIndex back into the same file would add one more unnamed column every
# time this cell is run.
df.to_csv(csv_path, index=False)


# Optional sanity check (the column has been renamed above):
# stats = df['D_D_Ask_daily_APB'].describe()
# print(stats)