In [3]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime
import glob
import multiprocessing as mp
from functools import partial
import time

from tqdm import tqdm


In [9]:
# # 计算大单买卖比例因子
# def calculate_daily_factor(trading_date):
#     order_pth = f"/data/HighFreqData/Order/l2order/{pd.to_datetime(trading_date).strftime('%Y%m%d')}.parquet"

#     # 检查文件是否存在
#     if not os.path.exists(order_pth):
#         raise FileNotFoundError(f"数据文件不存在: {order_pth}")

#     conn = duckdb.connect(database=':memory:')
    
#     # 首先获取早盘订单数据
#     query = f"""
#     SELECT 
#         security_code,
#         order_side, 
#         order_type,
#         order_details,
#         order_price,
#         order_price_adj,
#         order_volume,
#         order_time
#     FROM '{order_pth}'
#     WHERE order_time >= 93000000 
#         AND order_time < 103000000
#         AND order_type = 'A'
#         AND order_details = 'L'
#     """
    
#     early_orders = conn.execute(query).fetchdf()
    
#     # 对每只股票分别处理
#     result_data = []
#     epsilon = 1e-10  # 数值稳定性
    
#     for code in early_orders['security_code'].unique():
#         try:
#             stock_orders = early_orders[early_orders['security_code'] == code]
            
#             # 如果订单数量太少，跳过
#             if len(stock_orders) < 10:
#                 continue
            
#             # 确定大单阈值（按订单量大小排序，取前10%）
#             large_order_threshold = stock_orders['order_volume'].quantile(0.9)
            
#             # 筛选大单
#             large_orders = stock_orders[stock_orders['order_volume'] >= large_order_threshold]
            
#             # 如果大单数量太少，跳过
#             if len(large_orders) < 5:
#                 continue
            
#             # 计算大单中买单和卖单的数量和总量
#             buy_orders = large_orders[large_orders['order_side'] == 1]
#             sell_orders = large_orders[large_orders['order_side'] == -1]

#             all_buy_orders = stock_orders[stock_orders['order_side'] == 1]
#             all_sell_orders = stock_orders[stock_orders['order_side'] == -1]
            
#             # 安全获取数量和总量
#             buy_count = len(buy_orders)
#             sell_count = len(sell_orders)
#             buy_volume = buy_orders['order_volume'].sum() if buy_count > 0 else 0
#             sell_volume = sell_orders['order_volume'].sum() if sell_count > 0 else 0
            
#             all_buy_count = len(all_buy_orders)
#             all_sell_count = len(all_sell_orders)
#             all_buy_volume = all_buy_orders['order_volume'].sum() if all_buy_count > 0 else 0
#             all_sell_volume = all_sell_orders['order_volume'].sum() if all_sell_count > 0 else 0
            
#             # 计算大单买卖比例指标（添加epsilon避免除零）
#             large_order_count_ratio = (buy_count + epsilon) / (sell_count + epsilon)
#             large_order_volume_ratio = (buy_volume + epsilon) / (sell_volume + epsilon)
            
#             # 取对数使分布更接近正态
#             log_count_ratio = np.log(large_order_count_ratio)
#             log_volume_ratio = np.log(large_order_volume_ratio)

#             # 安全计算净买卖比率
#             if buy_volume + sell_volume <= epsilon:
#                 large_net_order_volume_ratio = np.nan
#             else:
#                 ratio = (buy_volume - sell_volume)/(buy_volume + sell_volume)
#                 large_net_order_volume_ratio = np.sign(ratio) * np.log(1 + abs(ratio))
            
#             if all_buy_volume + all_sell_volume <= epsilon:
#                 net_order_volume_ratio = np.nan
#             else:
#                 ratio_all = (all_buy_volume - all_sell_volume)/(all_buy_volume + all_sell_volume)
#                 net_order_volume_ratio = np.sign(ratio_all) * np.log(1 + abs(ratio_all))
            
#             # 保存结果
#             result_data.append({
#                 'security_code': code,
#                 'large_order_count_ratio': log_count_ratio,
#                 'large_order_volume_ratio': log_volume_ratio,
#                 'large_order_avg_buy_size': buy_volume / (buy_count + epsilon) if buy_count > 0 else np.nan,
#                 'large_order_avg_sell_size': sell_volume / (sell_count + epsilon) if sell_count > 0 else np.nan,
#                 'large_order_threshold': large_order_threshold,
#                 'large_order_count': len(large_orders),
#                 'large_net_order_volume_ratio': large_net_order_volume_ratio,
#                 'net_order_volume_ratio': net_order_volume_ratio
#             })
#         except Exception as e:
#             print(f"处理股票 {code} 时出错: {str(e)}")
    
#     # 转换为DataFrame
#     factor_df = pd.DataFrame(result_data)
    
#     # 处理极端值
#     for col in ['large_order_count_ratio', 'large_order_volume_ratio', 
#                 'large_net_order_volume_ratio', 'net_order_volume_ratio']:
#         if col in factor_df.columns:
#             factor_df[col] = factor_df[col].replace([np.inf, -np.inf], np.nan)
    
#     return factor_df


In [10]:
def calculate_daily_factor(trading_date):
    """Compute per-security early-session order-flow factors for one day.

    The day's order file is read from
    ``/data/HighFreqData/Order/l2order/<YYYYMMDD>.parquet`` and aggregated
    entirely inside an in-memory DuckDB connection.

    Only rows with ``93000000 <= order_time < 103000000``,
    ``order_type = 'A'`` and ``order_details = 'L'`` are used
    (NOTE(review): order_time presumably encodes HHMMSSmmm, i.e. 09:30-10:30;
    confirm against the data dictionary).

    Per security the query:
      * keeps securities with at least 10 such orders,
      * takes the 0.9 continuous percentile of ``order_volume`` as the
        "large order" threshold,
      * keeps securities with at least 5 orders at or above that threshold,
      * treats ``order_side = 1`` as buy and ``order_side = -1`` as sell.

    Args:
        trading_date: Anything ``pd.to_datetime`` accepts.

    Returns:
        pandas.DataFrame with columns ``security_code``,
        ``large_order_count_ratio``, ``large_order_volume_ratio``,
        ``large_order_avg_buy_size``, ``large_order_avg_sell_size``,
        ``large_order_threshold``, ``large_order_count``,
        ``large_net_order_volume_ratio`` and ``net_order_volume_ratio``.
        Infinite values in the four ratio columns are replaced with NaN.

    Raises:
        FileNotFoundError: If the day's parquet file does not exist.
        Exception: Any error raised while running the query is printed and
            re-raised unchanged.
    """
    day_tag = pd.to_datetime(trading_date).strftime('%Y%m%d')
    order_pth = f"/data/HighFreqData/Order/l2order/{day_tag}.parquet"

    # Fail early with a clear message instead of a DuckDB I/O error.
    if not os.path.exists(order_pth):
        raise FileNotFoundError(f"数据文件不存在: {order_pth}")

    # Columns that hold log-ratios and may need inf -> NaN clean-up.
    ratio_columns = (
        'large_order_count_ratio',
        'large_order_volume_ratio',
        'large_net_order_volume_ratio',
        'net_order_volume_ratio',
    )

    conn = duckdb.connect(database=':memory:')

    # The whole factor computation is expressed as a single SQL statement.
    query = f"""
    WITH early_orders AS (
        SELECT 
            security_code,
            order_side, 
            order_volume
        FROM '{order_pth}'
        WHERE order_time >= 93000000 
            AND order_time < 103000000
            AND order_type = 'A'
            AND order_details = 'L'
    ),
    
    -- 计算每只股票的订单量分位数
    stock_thresholds AS (
        SELECT 
            security_code,
            PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY order_volume) AS large_order_threshold,
            COUNT(*) AS total_orders
        FROM early_orders
        GROUP BY security_code
        HAVING COUNT(*) >= 10
    ),
    
    -- 标记大单
    large_orders AS (
        SELECT 
            e.*,
            t.large_order_threshold
        FROM early_orders e
        JOIN stock_thresholds t ON e.security_code = t.security_code
        WHERE e.order_volume >= t.large_order_threshold
    ),
    
    -- 按股票分组计算大单统计
    large_order_stats AS (
        SELECT 
            security_code,
            large_order_threshold,
            COUNT(*) AS large_order_count,
            SUM(CASE WHEN order_side = 1 THEN 1 ELSE 0 END) AS buy_count,
            SUM(CASE WHEN order_side = -1 THEN 1 ELSE 0 END) AS sell_count,
            SUM(CASE WHEN order_side = 1 THEN order_volume ELSE 0 END) AS buy_volume,
            SUM(CASE WHEN order_side = -1 THEN order_volume ELSE 0 END) AS sell_volume
        FROM large_orders
        GROUP BY security_code, large_order_threshold
        HAVING COUNT(*) >= 5
    ),
    
    -- 计算所有订单的统计
    all_order_stats AS (
        SELECT 
            security_code,
            SUM(CASE WHEN order_side = 1 THEN order_volume ELSE 0 END) AS all_buy_volume,
            SUM(CASE WHEN order_side = -1 THEN order_volume ELSE 0 END) AS all_sell_volume
        FROM early_orders
        GROUP BY security_code
    )
    
    -- 最终计算各种比率
    SELECT 
        l.security_code,
        -- 大单买卖数量比率（取对数）
        LN((l.buy_count + 1e-10) / (l.sell_count + 1e-10)) AS large_order_count_ratio,
        -- 大单买卖量比率（取对数）
        LN((l.buy_volume + 1e-10) / (l.sell_volume + 1e-10)) AS large_order_volume_ratio,
        -- 大单平均买入规模
        CASE WHEN l.buy_count > 0 THEN l.buy_volume / l.buy_count ELSE NULL END AS large_order_avg_buy_size,
        -- 大单平均卖出规模
        CASE WHEN l.sell_count > 0 THEN l.sell_volume / l.sell_count ELSE NULL END AS large_order_avg_sell_size,
        -- 大单阈值
        l.large_order_threshold,
        -- 大单数量
        l.large_order_count,
        -- 大单净买卖量比率（带符号对数）
        CASE 
            WHEN (l.buy_volume + l.sell_volume) <= 1e-10 THEN NULL
            ELSE 
                SIGN((l.buy_volume - l.sell_volume) / (l.buy_volume + l.sell_volume)) * 
                LN(1 + ABS((l.buy_volume - l.sell_volume) / (l.buy_volume + l.sell_volume)))
        END AS large_net_order_volume_ratio,
        -- 所有订单净买卖量比率（带符号对数）
        CASE 
            WHEN (a.all_buy_volume + a.all_sell_volume) <= 1e-10 THEN NULL
            ELSE 
                SIGN((a.all_buy_volume - a.all_sell_volume) / (a.all_buy_volume + a.all_sell_volume)) * 
                LN(1 + ABS((a.all_buy_volume - a.all_sell_volume) / (a.all_buy_volume + a.all_sell_volume)))
        END AS net_order_volume_ratio
    FROM large_order_stats l
    JOIN all_order_stats a ON l.security_code = a.security_code
    """

    try:
        factor_df = conn.execute(query).fetchdf()

        # The SQL already guards most degenerate cases; this is a last
        # safety net so downstream averages never see +/-inf.
        infinities = [np.inf, -np.inf]
        for col in ratio_columns:
            if col not in factor_df.columns:
                continue
            factor_df[col] = factor_df[col].replace(infinities, np.nan)

        return factor_df

    except Exception as e:
        print(f"SQL查询执行出错: {str(e)}")
        raise
    finally:
        # Always release the in-memory database, on success or failure.
        conn.close()


In [None]:
def process_single_date(trading_date, factor_name):
    """Compute and persist the daily factor file for a single date.

    The result is written to ``./factors/<factor_name>/<YYYYMMDD>.parquet``.
    An existing output file is never recomputed.

    Args:
        trading_date: Anything ``pd.to_datetime`` accepts (Timestamp,
            datetime, or a date string), matching what
            ``calculate_daily_factor`` accepts.
        factor_name: Name of the sub-directory under ``./factors``.

    Returns:
        Tuple ``(date_str, success, message)`` where ``date_str`` is the date
        formatted as ``YYYYMMDD``; ``success`` is True when the output file
        exists after the call (already present or freshly written) and False
        when the source data is missing or processing failed; ``message`` is
        a short status or the error text.
    """
    # Normalise first so string dates work the same way as Timestamps.
    date_str = pd.to_datetime(trading_date).strftime('%Y%m%d')
    output_dir = f"./factors/{factor_name}"
    file_path = f"{output_dir}/{date_str}.parquet"

    # Skip dates that were already processed.
    if os.path.exists(file_path):
        print(f"文件已存在，跳过: {file_path}")
        return date_str, True, "已存在"

    # Skip dates that have no raw order data (weekends, holidays, gaps).
    data_file = f"/data/HighFreqData/Order/l2order/{date_str}.parquet"
    if not os.path.exists(data_file):
        print(f"跳过日期 {date_str}: 原始数据文件不存在")
        return date_str, False, "数据文件不存在"

    # Write to a per-process temporary name and rename at the end, so an
    # interrupted run can never leave a truncated "<date>.parquet" behind that
    # later runs would mistake for a finished result. The temporary name does
    # not end in ".parquet", so "*.parquet" globs ignore it.
    tmp_path = f"{file_path}.tmp.{os.getpid()}"

    try:
        daily_factor_df = calculate_daily_factor(trading_date)

        daily_factor_df['date'] = date_str

        # Make the function usable on its own, not only via the pool driver.
        os.makedirs(output_dir, exist_ok=True)
        daily_factor_df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, file_path)
        print(f"已生成因子文件: {file_path}")
        return date_str, True, "成功"

    except Exception as e:
        # Best-effort clean-up of a partially written temporary file.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        error_msg = str(e)
        print(f"处理日期 {date_str} 时出错: {error_msg}")
        return date_str, False, error_msg


def derive_daily_factor(start_date, end_date, factor_name, num_processes=10):
    """Generate daily factor files for a date range using a process pool.

    Every calendar day between ``start_date`` and ``end_date`` (inclusive,
    ``freq='D'``) is passed to ``process_single_date``; days without raw data
    are reported by the worker as unsuccessful and counted as skipped here.

    Args:
        start_date: First date of the range (anything ``pd.date_range``
            accepts).
        end_date: Last date of the range.
        factor_name: Sub-directory name under ``./factors`` for the output.
        num_processes: Number of worker processes in the pool.

    Returns:
        The output directory path ``./factors/<factor_name>``.
    """
    start_time = time.time()

    trading_dates = pd.date_range(start=start_date, end=end_date, freq='D')

    # Create the output directory once, before any worker writes into it.
    output_dir = f"./factors/{factor_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Bind the constant argument so pool.map only has to supply the date.
    process_date_with_args = partial(process_single_date, factor_name=factor_name)

    print(f"开始使用 {num_processes} 个进程并行处理 {len(trading_dates)} 个交易日...")

    # The context manager terminates the workers if map() raises (including
    # on KeyboardInterrupt), so no orphaned processes are left behind.
    with mp.Pool(processes=num_processes) as pool:
        results = pool.map(process_date_with_args, trading_dates)
        pool.close()
        pool.join()

    processed_dates = [date_str for date_str, success, _ in results if success]
    skipped_dates = [date_str for date_str, success, _ in results if not success]

    # Unsuccessful dates whose message is not the "source data missing" status
    # are real processing errors; surface them instead of hiding them among
    # the skipped weekends and holidays.
    missing_data_msg = "数据文件不存在"
    failed_dates = [
        (date_str, message)
        for date_str, success, message in results
        if not success and message != missing_data_msg
    ]

    print("\n处理完成:")
    print(f"成功处理 {len(processed_dates)} 个日期")
    print(f"跳过 {len(skipped_dates)} 个日期")
    if failed_dates:
        print(f"其中 {len(failed_dates)} 个日期处理失败:")
        for date_str, message in failed_dates:
            print(f"  {date_str}: {message}")
    print(f"总耗时: {time.time() - start_time:.1f} 秒")

    return output_dir

In [4]:
def calculate_rolling_20d_avg(factor_names, factor_dir, start_date=None, end_date=None,
                              min_window=5, window=20):
    """Compute a per-security rolling mean of daily factor values.

    All ``*.parquet`` files in ``factor_dir`` are loaded, optionally filtered
    to ``[start_date, end_date]``, and each requested factor is averaged over
    the last ``window`` rows (distinct dates present in the filtered data) for
    every security.

    Args:
        factor_names: A single column name or an iterable of column names.
        factor_dir: Directory containing the daily parquet files. Each file
            must provide ``date`` and ``security_code`` columns plus the
            requested factor columns.
        start_date: Optional inclusive lower bound on ``date``.
        end_date: Optional inclusive upper bound on ``date``.
        min_window: Minimum number of non-NaN observations inside the window
            required to produce a value (``min_periods`` of the rolling mean).
        window: Rolling window length in rows; defaults to 20.

    Returns:
        pandas.DataFrame in long format with ``date``, ``security_code`` and
        one column per requested factor holding the rolling mean (NaN where
        fewer than ``min_window`` observations were available).

    Raises:
        FileNotFoundError: If ``factor_dir`` contains no parquet files.
        ValueError: If a requested factor column is absent from the data.
    """
    # Accept a bare string or any iterable of names; a tuple must not be
    # wrapped as a single element.
    if isinstance(factor_names, str):
        factor_names = [factor_names]
    else:
        factor_names = list(factor_names)

    parquet_pattern = os.path.join(factor_dir, "*.parquet")
    parquet_files = glob.glob(parquet_pattern)
    print(f"在目录 {factor_dir} 中找到 {len(parquet_files)} 个Parquet文件")

    # Fail with a clear message rather than an opaque reader error.
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found matching: {parquet_pattern}")

    conn = duckdb.connect(database=':memory:')
    try:
        all_factor_data = conn.execute(f"""
        SELECT * FROM read_parquet('{parquet_pattern}')
    """).fetchdf()
    finally:
        # Release the connection even when reading fails.
        conn.close()

    missing_factors = [f for f in factor_names if f not in all_factor_data.columns]
    if missing_factors:
        raise ValueError(f"在Parquet文件中未找到以下因子列: {', '.join(missing_factors)}")

    all_factor_data['date'] = pd.to_datetime(all_factor_data['date'])

    all_factor_data = all_factor_data.sort_values(['date', 'security_code'])

    # Restrict to the requested date range. The filter is applied before the
    # rolling window, so the window only ever sees dates inside the range.
    if start_date:
        start_date = pd.to_datetime(start_date)
        all_factor_data = all_factor_data[all_factor_data['date'] >= start_date]
    if end_date:
        end_date = pd.to_datetime(end_date)
        all_factor_data = all_factor_data[all_factor_data['date'] <= end_date]

    # Universe used as the denominator of the coverage statistic.
    all_securities = all_factor_data['security_code'].unique()

    result_df = all_factor_data[['date', 'security_code']].copy()

    for factor_name in factor_names:
        print(f"\n处理因子: {factor_name}")

        # Wide layout: one row per date, one column per security.
        pivot_data = all_factor_data.pivot(index='date', columns='security_code', values=factor_name)

        # Column-wise rolling mean gives the per-security average directly.
        rolling_avg = pivot_data.rolling(window=window, min_periods=min_window).mean()

        # Back to long format. NaN rows are dropped explicitly so the result
        # does not depend on the pandas version's default stack() behaviour.
        factor_df = rolling_avg.stack().reset_index()
        factor_df.columns = ['date', 'security_code', factor_name]
        factor_df = factor_df.dropna(subset=[factor_name])

        # Daily coverage: share of the security universe with a value.
        non_nan_counts = factor_df.groupby('date').size()
        coverage = non_nan_counts / len(all_securities)
        print(f"\n{factor_name}因子覆盖率统计: 平均={coverage.mean():.2f}, 最小={coverage.min():.2f}")

        # Left merge keeps every original (date, security) row; rows without
        # enough history end up as NaN.
        result_df = pd.merge(
            result_df,
            factor_df,
            on=['date', 'security_code'],
            how='left'
        )

    return result_df

In [None]:
def main():
    """Build the daily factor files, smooth them, and export one CSV per factor.

    Steps:
      1. Generate daily parquet files for 2021-05-18 .. 2024-12-31 with
         ``derive_daily_factor`` (10 worker processes).
      2. Compute the rolling average of every factor with
         ``calculate_rolling_20d_avg`` (``min_window=5``).
      3. Write ``<factor>.csv`` (columns ``date``, ``security_code``,
         ``<factor>``) for each factor into the signal output directory.
    """
    factor_name = "Improved_Early_Order_Size_Ratio"
    start_date, end_date = '2021-05-18', '2024-12-31'

    all_factors = [
        'large_order_count_ratio',
        'large_order_volume_ratio',
        'large_order_avg_buy_size',
        'large_order_avg_sell_size',
        'large_order_threshold',
        'large_order_count',
        'large_net_order_volume_ratio',
        'net_order_volume_ratio',
    ]

    # Step 1: per-day factor files (already existing days are skipped).
    factor_dir = derive_daily_factor(start_date, end_date, factor_name, num_processes=10)
    print(f"所有因子文件已生成在目录: {factor_dir}")

    # Step 2: rolling average across days.
    result_df = calculate_rolling_20d_avg(all_factors, factor_dir, start_date, end_date, min_window=5)

    # Step 3: export, one CSV per factor.
    output_path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_Early_Order_Size_Ratio"
    os.makedirs(output_path, exist_ok=True)

    print(f"共计 {len(result_df)} 条记录")
    print("\n数据预览:")
    print(result_df.head())

    key_columns = ['date', 'security_code']
    for factor in all_factors:
        result_df[key_columns + [factor]].to_csv(f"{output_path}/{factor}.csv", index=False)


# Run the pipeline when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210621.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20220111.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211104.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210518.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211208.parquet



文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20220214.parquet
文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20220112.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211105.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210622.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211209.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210519.parquet





文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20220113.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210623.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210520.parquet文件已存在，跳过: ./factors/Improved_Early_Or

跳过日期 20220326: 原始数据文件不存在跳过日期 20211114: 原始数据文件不存在

文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210531.parquet

文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20220124.parquet



文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210705.parquet跳过日期 20220226: 原始数据文件不存在
跳过日期 20210801: 原始数据文件不存在文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211221.parquet
跳过日期 20210904: 原始数据文件不存在文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211008.parquet跳过日期 20220327: 原始数据文件不存在文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211115.parquet

文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210601.parquet
文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20220125.parquet




跳过日期 20220227: 原始数据文件不存在文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210706.parquet
文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20210802.parquet
文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio/20211222.parquet跳过日期 20210905: 原始数据文件不存在跳过日期 20211009: 原始数据文件不存在文件已存在，跳过: ./fact

In [5]:
def main():
    """Smooth pre-computed early-session factors and export one CSV per factor.

    Reads the daily parquet files from an existing factor directory (no
    regeneration is performed here), computes the rolling average of each
    factor for 2016-06-20 .. 2025-06-30 with ``calculate_rolling_20d_avg``
    (``min_window=5``), and writes ``<factor>.csv`` (columns ``date``,
    ``security_code``, ``<factor>``) into the signal output directory.
    """
    start_date, end_date = '2016-06-20', '2025-06-30'

    # Factor columns follow the pattern early_<window>_<metric>_order_volume_ratio,
    # ordered metric-major: net, avg, sum, each over the three time windows.
    time_windows = ('930_1030', '930_1000', '1000_1030')
    metrics = ('net', 'avg', 'sum')
    all_factors = [
        f'early_{time_window}_{metric}_order_volume_ratio'
        for metric in metrics
        for time_window in time_windows
    ]

    # Only referenced by the disabled regeneration call below.
    factor_name = "Improved_Early_Order_Size_Ratio_3_2"

    # Daily files are taken from a previous run instead of being regenerated:
    # factor_dir = derive_daily_factor(start_date, end_date, factor_name, num_processes=10)
    factor_dir = "/data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Improved_Early_Order_Size_Ratio_3"
    print(f"所有因子文件已生成在目录: {factor_dir}")

    result_df = calculate_rolling_20d_avg(all_factors, factor_dir, start_date, end_date, min_window=5)

    # Export, one CSV per factor.
    output_path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_Early_Order_Size_Ratio_3_2"
    os.makedirs(output_path, exist_ok=True)

    print(f"共计 {len(result_df)} 条记录")
    print("\n数据预览:")
    print(result_df.head())

    key_columns = ['date', 'security_code']
    for factor in all_factors:
        result_df[key_columns + [factor]].to_csv(f"{output_path}/{factor}.csv", index=False)

# Run the pipeline when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


所有因子文件已生成在目录: /data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Improved_Early_Order_Size_Ratio_3
在目录 /data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Improved_Early_Order_Size_Ratio_3 中找到 2193 个Parquet文件


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


处理因子: early_930_1030_net_order_volume_ratio

early_930_1030_net_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_930_1000_net_order_volume_ratio

early_930_1000_net_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_1000_1030_net_order_volume_ratio

early_1000_1030_net_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_930_1030_avg_order_volume_ratio

early_930_1030_avg_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_930_1000_avg_order_volume_ratio

early_930_1000_avg_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_1000_1030_avg_order_volume_ratio

early_1000_1030_avg_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_930_1030_sum_order_volume_ratio

early_930_1030_sum_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_930_1000_sum_order_volume_ratio

early_930_1000_sum_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47

处理因子: early_1000_1030_sum_order_volume_ratio

early_1000_1030_sum_order_volume_ratio因子覆盖率统计: 平均=0.77, 最小=0.47
共计 8996951 条记