In [None]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime
import glob
import multiprocessing as mp
from functools import partial
import time

def process_single_date(trading_date, output_dir):
    """Record which stocks touched their daily price limit early in one session.

    Loads the day's ceiling/floor prices from the database, aggregates the
    one-minute bars with ``930 <= start_time < 1030`` (presumably HHMM, i.e.
    09:30-10:30 -- confirm against the bar schema) into a per-stock high/low,
    and writes the codes whose high reached the ceiling or whose low reached
    the floor to ``{output_dir}/{YYYYMMDD}_limit_stocks.parquet``. When no
    stock qualifies an empty parquet file is still written, so the date is
    treated as already processed on the next run.

    Args:
        trading_date: Date to process; must support ``strftime``
            (e.g. ``pd.Timestamp`` or ``datetime``).
        output_dir: Directory that receives the per-day parquet file.

    Returns:
        A ``(date_str, success, count)`` tuple. ``date_str`` is ``YYYYMMDD``;
        ``success`` is True when the output file already existed or was
        written; ``count`` is the number of stocks written by this call
        (0 when the date was skipped, failed, or had no hits).
    """
    # Imported locally: the file's import block does not provide traceback,
    # and the error handler below needs it to report the real failure.
    import traceback

    date_str = trading_date.strftime('%Y%m%d')
    one_min_pth = f"/data/HighFreqData/MinuteQuote/new_minute/one_minute/{date_str}.parquet"
    output_file = f"{output_dir}/{date_str}_limit_stocks.parquet"

    # Skip dates whose output file already exists.
    if os.path.exists(output_file):
        print(f"日期 {date_str} 已处理，跳过")
        return date_str, True, 0

    if not os.path.exists(one_min_pth):
        print(f"数据文件不存在: {one_min_pth}")
        return date_str, False, 0

    # Date formatted for the SQL filters.
    sql_date = trading_date.strftime('%Y-%m-%d')
    # NOTE(review): credentials are hard-coded in source; consider loading the
    # connection string from the environment or a secrets store.
    conn_str = "mysql://lexuan_chen%40public%23Thetis:OWFF4UT!@192.168.55.161:2883/jydb"

    # Ceiling/floor prices for this trading day.
    sql_query1 = f"""
    SELECT 
        InnerCode,
        TradingDay as trading_day,
        PriceCeiling as price_ceiling,
        PriceFloor as price_floor
    FROM jydb.DZ_PriceLimit 
    WHERE DATE(TradingDay) = '{sql_date}'
    """

    # InnerCode -> security_code mapping for the same day.
    sql_query2 = f"""
    SELECT 
        InnerCode,
        SecuCode AS security_code,
        TradingDay as trading_day
    FROM smartquant.ReturnDaily 
    WHERE DATE(TradingDay) = '{sql_date}'
    """

    query_minute = f"""
        SELECT 
            security_code,
            MAX(high_price) AS high_price,
            MIN(low_price) AS low_price
        FROM '{one_min_pth}' 
        WHERE start_time < 1030
          AND start_time >= 930
        GROUP BY security_code 
        """

    try:
        daily_limit_df = cx.read_sql(conn_str, sql_query1)
        code_df = cx.read_sql(conn_str, sql_query2)

        daily_limit_df['trading_day'] = pd.to_datetime(daily_limit_df['trading_day'])
        code_df['trading_day'] = pd.to_datetime(code_df['trading_day'])

        daily_limit_df = pd.merge(daily_limit_df, code_df, on=['InnerCode', 'trading_day'], how='left')

        if daily_limit_df.empty:
            print(f"日期 {date_str} 没有涨跌停信息")
            return date_str, False, 0

        # Each call uses its own in-memory DuckDB connection; it is closed as
        # soon as the result is materialised, even if the query raises.
        conn = duckdb.connect(database=':memory:')
        try:
            early_mins = conn.execute(query_minute).fetchdf()
        finally:
            conn.close()

        if early_mins.empty:
            print(f"日期 {date_str} 没有符合条件的分钟数据")
            return date_str, False, 0

        # Rows without a mapped security_code cannot be matched to minute bars.
        daily_limit_df = daily_limit_df.dropna(subset=['security_code'])

        limit_dict = dict(zip(daily_limit_df['security_code'],
                              zip(daily_limit_df['price_ceiling'], daily_limit_df['price_floor'])))

        # The SQL GROUP BY already yields one row per security_code, so a single
        # pass over the rows is enough; sorting keeps the output ordered by code.
        early_mins = early_mins.sort_values('security_code')
        daily_limit_stocks = [
            code
            for code, high, low in zip(
                early_mins['security_code'],
                early_mins['high_price'],
                early_mins['low_price'],
            )
            if code in limit_dict
            and (high >= limit_dict[code][0] or low <= limit_dict[code][1])
        ]

        if daily_limit_stocks:
            limit_stocks_df = pd.DataFrame({
                'date': [trading_date] * len(daily_limit_stocks),
                'security_code': daily_limit_stocks
            })
            limit_stocks_df.to_parquet(output_file)
            print(f"日期 {date_str} 处理完成，标记了 {len(daily_limit_stocks)} 只股票")
            return date_str, True, len(daily_limit_stocks)

        # Write an empty file to mark the date as processed with no hits.
        pd.DataFrame(columns=['date', 'security_code']).to_parquet(output_file)
        print(f"日期 {date_str} 处理完成，没有股票达到涨跌停")
        return date_str, True, 0

    except Exception as e:
        # Worker boundary: report the failure and let the pool continue with
        # the remaining dates instead of aborting the whole run.
        print(f"处理日期 {date_str} 时出错: {e}")
        traceback.print_exc()
        return date_str, False, 0


def main():
    """Build per-day limit-hit files in parallel, then mask the factor with them.

    Runs ``process_single_date`` for every calendar day between the hard-coded
    start and end dates using a process pool, merges all non-empty per-day
    parquet files found in the output directory, sets
    ``early_order_size_ratio`` to NaN for every matching (date, security_code)
    row of the factor CSV, and writes the result to a new CSV. Nothing is
    written when no non-empty per-day file is found.
    """
    start_time = time.time()

    start_date = '2021-05-18'
    end_date = '2024-12-31'

    # Calendar days, not an exchange calendar: dates without a minute file are
    # reported as failures by process_single_date and count against the ratio.
    trading_dates = pd.date_range(start=start_date, end=end_date, freq='D')

    factor_path = "/data/home/lexuanchen/Factors/Order/Signal/early_order_size_ratio.csv"
    factor_df = pd.read_csv(factor_path)

    factor_df['date'] = pd.to_datetime(factor_df['date'])

    output_dir = "./factors/limit_stocks_daily"
    os.makedirs(output_dir, exist_ok=True)

    num_processes = 10

    print(f"使用 {num_processes} 个进程进行并行处理")

    # Bind the fixed argument so pool.map only has to supply the date.
    process_date_with_args = partial(
        process_single_date,
        output_dir=output_dir
    )

    # The context manager releases the workers even if map() raises; map()
    # itself blocks until every date has been processed.
    with mp.Pool(processes=num_processes) as pool:
        results = pool.map(process_date_with_args, trading_dates)

    processed_dates = [date_str for date_str, success, _ in results if success]
    processed_count = len(processed_dates)
    total_count = len(trading_dates)

    print(f"已处理 {processed_count}/{total_count} 个交易日 ({processed_count/total_count*100:.2f}%)")

    print("开始合并所有日期的结果...")
    all_files = glob.glob(f"{output_dir}/*_limit_stocks.parquet")
    all_limit_stocks = []

    for file in all_files:
        try:
            df = pd.read_parquet(file)
        except Exception as e:
            # Best effort: one unreadable file must not discard the rest.
            print(f"读取文件 {file} 时出错: {e}")
        else:
            if not df.empty:
                all_limit_stocks.append(df)

    if all_limit_stocks:
        mark_df = pd.concat(all_limit_stocks, ignore_index=True)

        # Align key dtypes on both sides so the merge compares like with like.
        mark_df['date'] = pd.to_datetime(mark_df['date'])
        factor_df['security_code'] = factor_df['security_code'].astype(str)
        mark_df['security_code'] = mark_df['security_code'].astype(str)

        # De-duplicate the keys: a repeated (date, code) pair would otherwise
        # duplicate factor rows in the left merge.
        mark_keys = mark_df[['date', 'security_code']].drop_duplicates()

        factor_df = factor_df.merge(
            mark_keys.assign(to_mark=True),
            on=['date', 'security_code'],
            how='left'
        )

        # Unmatched rows carry NaN in to_mark, matched rows carry True.
        to_mark = factor_df['to_mark'].notna()
        marked_count = int(to_mark.sum())
        factor_df.loc[to_mark, 'early_order_size_ratio'] = np.nan

        factor_df = factor_df.drop('to_mark', axis=1)

        print(f"共标记了 {marked_count} 个涨跌停股票的因子值为NaN")

        new_factor_path = "/data/home/lexuanchen/Factors/Order/Signal/hl_early_order_size_ratio.csv"
        factor_df.to_csv(new_factor_path, index=False)
        print(f"已将更新后的因子数据保存到 {new_factor_path}")
    else:
        print("没有找到需要标记为NaN的股票")

    end_time = time.time()
    print(f"总运行时间: {(end_time - start_time)/60:.2f} 分钟")

# Entry point: run the pipeline only when this module is executed directly, so
# that importing it (e.g. from a multiprocessing worker) does not re-run main().
if __name__ == "__main__":
    main()


In [9]:
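# The input is the residual factor (each day's factor values regressed on the
# same-period returns, keeping the residuals); the function below sets it to
# NaN for stocks that hit a price limit.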
def derive_hl_ret_factor():
    """Set the residual factor to NaN for (date, stock) pairs that hit a price limit.

    Loads ``residual_early_order_size_ratio.csv``, collects every non-empty
    ``*_limit_stocks.parquet`` file from the daily output directory, and sets
    ``residual_factor`` to NaN on rows whose (date, security_code) appears in
    those files. The masked frame is saved as
    ``hl_residual_early_order_size_ratio.csv``.

    Returns:
        The factor DataFrame. When no non-empty daily file is found, this is
        the frame exactly as loaded and nothing is written to disk.
    """
    factor_name = "early_order_size_ratio"
    output_dir = "./factors/limit_stocks_daily"
    factor_path = "/data/home/lexuanchen/Factors/Order/Signal"

    daily_files = glob.glob(f"{output_dir}/*_limit_stocks.parquet")
    factor_df = pd.read_csv(f"{factor_path}/residual_{factor_name}.csv")

    # Gather the per-day hit lists; unreadable files are reported and skipped.
    frames = []
    for file in daily_files:
        try:
            day_df = pd.read_parquet(file)
        except Exception as e:
            print(f"读取文件 {file} 时出错: {e}")
        else:
            if not day_df.empty:
                frames.append(day_df)

    # Nothing to mask: hand back the factor frame as loaded.
    if not frames:
        return factor_df

    mark_df = pd.concat(frames, ignore_index=True)

    # Give both frames identical key dtypes (datetime date, string code).
    for frame in (factor_df, mark_df):
        frame['date'] = pd.to_datetime(frame['date'])
        frame['security_code'] = frame['security_code'].astype(str)

    flagged = mark_df.assign(to_mark=True)
    factor_df = pd.merge(factor_df, flagged, how='left', on=['date', 'security_code'])

    hit_rows = factor_df['to_mark'].eq(True)
    marked_count = factor_df['to_mark'].sum()
    factor_df.loc[hit_rows, 'residual_factor'] = np.nan
    factor_df = factor_df.drop(columns='to_mark')

    print(f"共标记了 {marked_count} 个涨跌停股票的因子值为NaN")

    new_factor_path = f"{factor_path}/hl_residual_early_order_size_ratio.csv"
    factor_df.to_csv(new_factor_path, index=False)
    print(f"已将更新后的因子数据保存到 {new_factor_path}")

    return factor_df


# Entry point: apply the limit-hit mask to the residual factor only when this
# module is executed directly, not when it is imported.
if __name__ == "__main__":
    derive_hl_ret_factor()


共标记了 37886 个涨跌停股票的因子值为NaN
已将更新后的因子数据保存到 /data/home/lexuanchen/Factors/Order/Signal/hl_residual_early_order_size_ratio.csv
