In [1]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime
import glob
import multiprocessing as mp
from functools import partial
import time

from tqdm import tqdm


In [2]:
def calculate_daily_factor(trading_date):
    """Compute early-session order-volume imbalance factors for one day.

    Reads the day's order parquet file through DuckDB and, for each of three
    morning windows, aggregates the rows with ``order_type = 'A'`` and
    ``order_details = 'L'`` per ``security_code`` into three columns:

    * ``<window>_net_order_volume_ratio``: (buy - sell) / (buy + sell) over the
      summed volumes; NULL when the total is <= 1e-10.
    * ``<window>_sum_order_volume_ratio``: ln of summed buy volume over summed
      sell volume, each offset by 1e-10.
    * ``<window>_avg_order_volume_ratio``: ln of mean buy volume over mean
      sell volume, each offset by 1e-10.

    "Buy" is ``order_side = 1`` and "sell" is ``order_side = -1``.

    Args:
        trading_date: Any value accepted by ``pd.to_datetime``; its
            ``YYYYMMDD`` form selects ``/data/cephfs/order/YYYYMMDD.parquet``.

    Returns:
        pandas.DataFrame keyed by ``security_code`` with the ratio columns of
        every window, outer-merged so a security present in any window is kept.

    Raises:
        FileNotFoundError: If the day's parquet file does not exist.
        Exception: Any error raised while querying or merging is printed
            together with the offending query and then re-raised.
    """
    order_pth = f"/data/cephfs/order/{pd.to_datetime(trading_date).strftime('%Y%m%d')}.parquet"

    # Fail early with a clear message when the day's data file is missing.
    if not os.path.exists(order_pth):
        raise FileNotFoundError(f"数据文件不存在: {order_pth}")

    # Three morning windows, keyed by the column prefix they produce.
    # NOTE(review): the order_time bounds look like HHMMSSmmm integers -- confirm.
    time_intervals = {
        'early_930_1000': "order_time >= 93000000 AND order_time < 100000000",
        'early_1000_1030': "order_time >= 100000000 AND order_time < 103000000",
        'early_930_1030': "order_time >= 93000000 AND order_time < 103000000"
    }

    result_df = None

    # One in-memory connection serves every window; it is always closed below.
    conn = duckdb.connect(database=':memory:')
    try:
        for interval_name, time_condition in time_intervals.items():
            query = f"""
            WITH early_orders AS (
                SELECT 
                    security_code,
                    order_side, 
                    order_volume
                FROM '{order_pth}'
                WHERE order_type = 'A'
                    AND {time_condition}
                    AND order_details = 'L'
            ),
            
            -- per-security buy/sell volume totals and means
            all_order_stats AS (
                SELECT 
                    security_code,
                    SUM(CASE WHEN order_side = 1 THEN order_volume ELSE 0 END) AS all_buy_volume,
                    SUM(CASE WHEN order_side = -1 THEN order_volume ELSE 0 END) AS all_sell_volume,
                    AVG(CASE WHEN order_side = 1 THEN order_volume ELSE NULL END) AS avg_all_buy_volume,
                    AVG(CASE WHEN order_side = -1 THEN order_volume ELSE NULL END) AS avg_all_sell_volume
                FROM early_orders
                GROUP BY security_code
            )
            
            -- final ratios
            SELECT 
                security_code,
                CASE 
                    WHEN (all_buy_volume + all_sell_volume) <= 1e-10 THEN NULL
                    ELSE 
                        (all_buy_volume - all_sell_volume) / (all_buy_volume + all_sell_volume)
                END AS {interval_name}_net_order_volume_ratio,
                LN((all_buy_volume + 1e-10)/(all_sell_volume + 1e-10)) AS {interval_name}_sum_order_volume_ratio,
                LN((avg_all_buy_volume + 1e-10)/(avg_all_sell_volume + 1e-10)) AS {interval_name}_avg_order_volume_ratio
            FROM all_order_stats 
            """

            try:
                current_df = conn.execute(query).fetchdf()

                # Defensive clean-up: never let +/-inf leak into the factor columns.
                for col in current_df.columns:
                    if '_order_volume_ratio' in col:
                        current_df[col] = current_df[col].replace([np.inf, -np.inf], np.nan)

                if result_df is None:
                    result_df = current_df
                else:
                    # Outer merge keeps securities that traded in only some windows.
                    result_df = result_df.merge(
                        current_df,
                        on='security_code',
                        how='outer'
                    )

            except Exception as e:
                print(f"SQL查询执行出错 ({interval_name}): {str(e)}")
                print(f"查询内容: {query}")
                raise
    finally:
        conn.close()

    # Only reachable if no window was processed; return an empty, keyed frame.
    if result_df is None:
        result_df = pd.DataFrame(columns=['security_code'])

    return result_df


In [3]:
def process_single_date(trading_date, factor_name):
    """Compute and persist the daily factor file for one calendar date.

    The result is written to ``./factors/<factor_name>/<YYYYMMDD>.parquet``.
    The write goes to a temporary sibling file first and is then renamed into
    place, so an interrupted run never leaves a truncated ``.parquet`` file
    that a later run would skip as "already exists".

    Args:
        trading_date: Object exposing ``strftime`` (e.g. ``pd.Timestamp``).
        factor_name: Name of the factor; selects the output sub-directory.

    Returns:
        Tuple ``(date_str, success, message)``. ``success`` is True when the
        file already existed or was written, False when the raw data file is
        missing or the computation failed (``message`` then holds the reason).
    """
    date_str = trading_date.strftime('%Y%m%d')
    output_dir = f"./factors/{factor_name}"
    file_path = f"{output_dir}/{date_str}.parquet"

    # Already computed on a previous run: nothing to do.
    if os.path.exists(file_path):
        print(f"文件已存在，跳过: {file_path}")
        return date_str, True, "已存在"

    # No raw order data for this date (weekend, holiday, or not yet delivered).
    data_file = f"/data/cephfs/order/{date_str}.parquet"
    if not os.path.exists(data_file):
        print(f"跳过日期 {date_str}: 原始数据文件不存在")
        return date_str, False, "数据文件不存在"

    # The temp name does not end in ".parquet", so "*.parquet" globs ignore it;
    # the pid keeps concurrent workers from clobbering each other.
    tmp_path = f"{file_path}.tmp.{os.getpid()}"

    try:
        daily_factor_df = calculate_daily_factor(trading_date)

        daily_factor_df['date'] = date_str

        os.makedirs(output_dir, exist_ok=True)
        daily_factor_df.to_parquet(tmp_path, index=False)
        # Atomic publish: readers see either no file or the complete file.
        os.replace(tmp_path, file_path)
        print(f"已生成因子文件: {file_path}")
        return date_str, True, "成功"

    except Exception as e:
        # Best-effort removal of a partially written temp file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        error_msg = str(e)
        print(f"处理日期 {date_str} 时出错: {error_msg}")
        return date_str, False, error_msg


def derive_daily_factor(start_date, end_date, factor_name, num_processes=10):
    """Generate daily factor files for every calendar day in a date range.

    Each day from ``start_date`` to ``end_date`` (inclusive, daily frequency,
    so weekends and holidays are included) is handed to
    ``process_single_date`` in a process pool.

    Args:
        start_date: First day of the range (anything ``pd.date_range`` accepts).
        end_date: Last day of the range.
        factor_name: Factor name; output goes to ``./factors/<factor_name>``.
        num_processes: Number of worker processes in the pool.

    Returns:
        The output directory path ``./factors/<factor_name>``.

    Note:
        In the printed summary, the "processed" count includes dates whose file
        already existed, and the "skipped" count includes both dates without
        raw data and dates whose computation failed.
    """
    trading_dates = pd.date_range(start=start_date, end=end_date, freq='D')

    # Make sure the output directory exists before any worker writes to it.
    output_dir = f"./factors/{factor_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Bind the constant argument so pool.map only has to supply the date.
    process_date_with_args = partial(process_single_date, factor_name=factor_name)

    pool = mp.Pool(processes=num_processes)
    try:
        print(f"开始使用 {num_processes} 个进程并行处理 {len(trading_dates)} 个交易日...")
        results = pool.map(process_date_with_args, trading_dates)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind if mapping fails or is interrupted.
        pool.terminate()
        raise
    finally:
        pool.join()

    # Split the per-date outcomes by their success flag.
    processed_dates = [date_str for date_str, success, _ in results if success]
    skipped_dates = [date_str for date_str, success, _ in results if not success]

    print(f"\n处理完成:")
    print(f"成功处理 {len(processed_dates)} 个日期")
    print(f"跳过 {len(skipped_dates)} 个日期")

    return output_dir

In [4]:
def calculate_rolling_20d_avg(factor_names, factor_dir, start_date=None, end_date=None, min_window=5):
    """Compute the 20-row rolling mean of daily factors per security.

    Every ``*.parquet`` file in ``factor_dir`` is loaded and concatenated. Each
    base factor name is expanded with the three window prefixes
    (``early_930_1000``, ``early_1000_1030``, ``early_930_1030``), and each
    resulting column is pivoted to a date x security table, smoothed with
    ``rolling(window=20, min_periods=min_window).mean()`` and melted back.

    Args:
        factor_names: One base factor name (str) or a sequence of them.
        factor_dir: Directory holding the daily factor parquet files.
        start_date: Optional lower bound (inclusive) on ``date``.
        end_date: Optional upper bound (inclusive) on ``date``.
        min_window: Minimum number of observations required in a window.

    Returns:
        pandas.DataFrame with ``date``, ``security_code`` and one smoothed
        column per expanded factor name, left-joined onto the filtered
        (date, security_code) pairs.

    Raises:
        ValueError: If ``factor_dir`` contains no parquet files, or if any
            expanded factor column is missing from the loaded data.
    """
    # Accept a single name or any sequence of names.
    if isinstance(factor_names, str):
        factor_names = [factor_names]
    else:
        factor_names = list(factor_names)

    time_intervals = ['early_930_1000', 'early_1000_1030', 'early_930_1030']
    updated_factor_names = [
        f"{interval}_{base_name}"
        for base_name in factor_names
        for interval in time_intervals
    ]

    # Sorted for a deterministic read order (glob order is filesystem-dependent).
    parquet_files = sorted(glob.glob(os.path.join(factor_dir, "*.parquet")))
    print(f"在目录 {factor_dir} 中找到 {len(parquet_files)} 个Parquet文件")

    # pd.concat on an empty list fails with an opaque message; say what is wrong.
    if not parquet_files:
        raise ValueError(f"在目录 {factor_dir} 中未找到任何Parquet文件")

    all_factor_data = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)

    # Every requested factor column must be present in the loaded data.
    missing_factors = [f for f in updated_factor_names if f not in all_factor_data.columns]
    if missing_factors:
        raise ValueError(f"在Parquet文件中未找到以下因子列: {', '.join(missing_factors)}")

    all_factor_data['date'] = pd.to_datetime(all_factor_data['date'])

    all_factor_data = all_factor_data.sort_values(['date', 'security_code'])

    # NOTE(review): the date filter runs BEFORE the rolling window, so the first
    # rows of the range get no warm-up from earlier files even when those files
    # exist in factor_dir -- confirm this is intended.
    if start_date:
        start_date = pd.to_datetime(start_date)
        all_factor_data = all_factor_data[all_factor_data['date'] >= start_date]
    if end_date:
        end_date = pd.to_datetime(end_date)
        all_factor_data = all_factor_data[all_factor_data['date'] <= end_date]

    # Universe size used as the denominator of the coverage statistic.
    all_securities = all_factor_data['security_code'].unique()

    result_df = all_factor_data[['date', 'security_code']].copy()

    for factor_name in updated_factor_names:
        print(f"\n处理因子: {factor_name}")

        # Long -> wide: one row per date, one column per security.
        pivot_data = all_factor_data.pivot(index='date', columns='security_code', values=factor_name)

        # Rolling mean down the date axis, independently for each security.
        rolling_avg = pivot_data.rolling(window=20, min_periods=min_window).mean()

        # Wide -> long again.
        factor_df = rolling_avg.stack().reset_index()
        factor_df.columns = ['date', 'security_code', factor_name]

        # Daily coverage: share of the universe with a non-NaN smoothed value.
        non_nan_counts = factor_df.dropna(subset=[factor_name]).groupby('date').size()
        coverage = non_nan_counts / len(all_securities)
        print(f"\n{factor_name}因子覆盖率统计: 平均={coverage.mean():.2f}, 最小={coverage.min():.2f}")

        result_df = pd.merge(
            result_df,
            factor_df,
            on=['date', 'security_code'],
            how='left'
        )

    return result_df

In [6]:
def main(
    start_date='2025-05-20',
    end_date='2025-06-30',
    factor_name="Improved_Early_Order_Size_Ratio_3",
    output_root="/data/home/lexuanchen/Factors/Order/Signal",
    num_processes=10,
    min_window=5,
):
    """Run the full pipeline: daily factor files, rolling means, CSV export.

    Args:
        start_date: First day of the range to process.
        end_date: Last day of the range to process.
        factor_name: Factor name; used for the daily-file directory and for
            the CSV output sub-directory.
        output_root: Directory under which ``<factor_name>/`` is created for
            the CSV files.
        num_processes: Worker processes used to build the daily files.
        min_window: Minimum observations required by the rolling mean.

    Called without arguments, it uses the same dates, factor name and output
    location that were previously hard-coded in the function body.
    """
    base_factors = ['avg_order_volume_ratio', 'net_order_volume_ratio', 'sum_order_volume_ratio']
    factor_dir = derive_daily_factor(start_date, end_date, factor_name, num_processes=num_processes)

    print(f"所有因子文件已生成在目录: {factor_dir}")

    result_df = calculate_rolling_20d_avg(base_factors, factor_dir, start_date, end_date, min_window=min_window)

    # CSV output directory, derived from the factor name so the two cannot drift.
    output_path = f"{output_root}/{factor_name}"
    os.makedirs(output_path, exist_ok=True)

    print(f"共计 {len(result_df)} 条记录")

    print("\n数据预览:")
    print(result_df.head())

    # Window prefixes; must match those used when the daily files were built.
    time_intervals = ['early_930_1000', 'early_1000_1030', 'early_930_1030']

    # One CSV per (base factor, window) combination.
    for base_factor in base_factors:
        for interval in time_intervals:
            factor = f"{interval}_{base_factor}"
            if factor in result_df.columns:
                output_file_path = f"{output_path}/{factor}.csv"
                factor_df = result_df[['date', 'security_code', factor]]
                factor_df.to_csv(output_file_path, index=False)
            else:
                print(f"警告: 列 '{factor}' 在结果数据中不存在")



# Entry point: run main() when this code is executed as the main module.
if __name__ == "__main__":
    main()


文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250520.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250526.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250522.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250528.parquet跳过日期 20250524: 原始数据文件不存在文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250530.parquet跳过日期 20250601: 原始数据文件不存在跳过日期 20250607: 原始数据文件不存在






文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250523.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250521.parquet
文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250527.parquet文件已存在，跳过: ./factors/Improved_Early_Order_Size_Ratio_3/20250529.parquet跳过日期 20250531: 原始数据文件不存在跳过日期 20250525: 原始数据文件不存在跳过日期 20250602: 原始数据文件不存在



跳过日期 20250608: 原始数据文件不存在



跳过日期 20250615: 原始数据文件不存在
跳过日期 20250621: 原始数据文件不存在
跳过日期 20250622: 原始数据文件不存在


开始使用 10 个进程并行处理 42 个交易日...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250617.parquet
已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250603.parquet
已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250623.parquet
已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250605.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250616.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250609.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250611.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250619.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250613.parquet
跳过日期 20250614: 原始数据文件不存在
跳过日期 20250629: 原始数据文件不存在


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250625.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250624.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250606.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250627.parquet
跳过日期 20250628: 原始数据文件不存在


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250620.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250612.parquet
已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250610.parquet
已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250604.parquet已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250618.parquet

已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250630.parquet
已生成因子文件: ./factors/Improved_Early_Order_Size_Ratio_3/20250626.parquet

处理完成:
成功处理 29 个日期
跳过 13 个日期
所有因子文件已生成在目录: ./factors/Improved_Early_Order_Size_Ratio_3
在目录 ./factors/Improved_Early_Order_Size_Ratio_3 中找到 2193 个Parquet文件

处理因子: early_930_1000_avg_order_volume_ratio

early_930_1000_avg_order_volume_ratio因子覆盖率统计: 平均=1.00, 最小=0.99

处理因子: early_1000_1030_avg_order_volume_ratio

early_1000_1030_avg_order_volume_ratio因子覆盖率统计: 平均=1.00, 最小=0.99

处理因子: early_930_1030_avg_order_volume_ratio

early_930_1030_avg_order_volume_ratio因子覆盖率统计: 平均=1.00, 最小=0.99

处理因子: early_930_1000_net_order_volume_ratio

early_930_1000_net_order_volume_ratio因子覆盖率统计: 平均=1