In [1]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime
import glob
import multiprocessing as mp
from functools import partial
import time
from scipy import stats

In [16]:
adj_path = "/data/home/lexuanchen/.conda/envs/Order_Improvement/Daily_Adjusting_Factor/20220909.parquet"

adj_factors = pd.read_parquet(adj_path)

print(adj_factors.head())

   InnerCode ExDiviDate  AdjustingFactor security_code
0          3 2022-07-22       150.694694        000001
1          6 2022-08-25       138.141483        000002
2         11 1996-07-27         4.797420        000003
3         14 2006-08-18         5.887560        000004
4         17 2006-07-31         9.833603        000005


In [17]:
print(adj_factors[adj_factors['security_code']=='688981'])

Empty DataFrame
Columns: [InnerCode, ExDiviDate, AdjustingFactor, security_code]
Index: []


In [18]:
print(len(adj_factors))

4791


In [6]:
order_path = "/data/home/lexuanchen/Factors/Order/Signal/Sample_Data/APB/Adjusted_1d_APB_20220909.parquet"

order_df = pd.read_parquet(order_path)

print(len(order_df))

print(order_df[order_df['security_code']=='688981'])

4797
          date security_code  daily_value_p01_p99  daily_sum_price_p01_p99  \
4795  20220909        688981         5.629170e+08                363303.89   

      daily_volume_p01_p99  daily_count_p01_p99  daily_value_p05_p95  \
4795            14058130.0                 9077          563521545.2   

      daily_sum_price_p05_p95  daily_volume_p05_p95  daily_count_p05_p95  \
4795                363786.13            14058130.0                 9077   

      adj_value_p01_p99  adj_price_p01_p99  adj_value_p05_p95  \
4795                NaN                NaN                NaN   

      adj_price_p05_p95  
4795                NaN  


In [None]:
print(len(order_df[~order_df['security_code'].isnull()]))

4797


In [19]:
# 找出 order_df 中 security_code 也存在于 adjust_df 的股票数量
common_count = order_df['security_code'].isin(adj_factors['security_code']).sum()
print(f"order_df 中有 {common_count} 只股票存在于 adjust_df 中")

order_df 中有 4599 只股票存在于 adjust_df 中


In [10]:
orig_order = pd.read_parquet("./factors/DD_Bid_Percent_adjusted_APB/20220909.parquet")

print(len(orig_order))

4797


In [25]:
import os
import glob
import pandas as pd
import multiprocessing
from tqdm import tqdm
import time
from datetime import datetime

def get_adjust_price(df_close, adj_factors):
    """
    使用复权因子调整价格（只对价格相关字段进行复权，不对交易量进行复权）
    
    参数:
    df_close: 包含价格数据的DataFrame，包含p01-p99和p05-p95两种缩尾处理的结果
    adj_factors: 当日对应的复权因子DataFrame
    
    返回:
    调整后的价格DataFrame
    """
    
    # 保存原始日期格式
    original_date_type = df_close['date'].dtype
    original_date_format = None
    if original_date_type == 'object':
        # 如果是字符串，记录格式
        sample_date = df_close['date'].iloc[0]
        if isinstance(sample_date, str):
            if '-' in sample_date:
                original_date_format = '%Y-%m-%d'
            else:
                original_date_format = '%Y%m%d'
    
    # 转换为日期类型进行计算
    if original_date_type == 'object':
        df_close['date'] = pd.to_datetime(df_close['date'])
    
    df_close['IsReadjusted'] = False
    
    # 创建需要复权的字段副本
    # 对p01-p99缩尾处理的价格相关字段进行复权
    df_close['adj_value_p01_p99'] = df_close['daily_value_p01_p99']
    df_close['adj_price_p01_p99'] = df_close['daily_sum_price_p01_p99']
    
    # 对p05-p95缩尾处理的价格相关字段进行复权
    df_close['adj_value_p05_p95'] = df_close['daily_value_p05_p95']
    df_close['adj_price_p05_p95'] = df_close['daily_sum_price_p05_p95']
    
    # 只对价格相关字段进行复权
    cols2adj = [
        'adj_value_p01_p99', 'adj_price_p01_p99',
        'adj_value_p05_p95', 'adj_price_p05_p95'
    ]
    
    security_code = df_close['security_code'].iloc[0]
    # 从复权因子表中查找对应的股票
    df_adj_spec = adj_factors[adj_factors.security_code == security_code]
    
    if df_adj_spec.shape[0] == 0:
        print(f"adj_factors中没有{security_code}数据，返回原始数据")
        return df_close, False
    
    # 按照复权日期排序
    df_adj_spec = df_adj_spec.sort_values(by='ExDiviDate', ascending=False)
    
    # 应用复权因子
    for d, f in df_adj_spec[['ExDiviDate', 'AdjustingFactor']].values:
        # 确保d是日期类型
        if not isinstance(d, pd.Timestamp):
            d = pd.to_datetime(d)
        
        mask = (df_close.date >= d) & ~df_close.IsReadjusted
        df_close.loc[mask, cols2adj] = df_close.loc[mask, cols2adj] * f
        df_close.loc[mask, 'IsReadjusted'] = True
    
    del df_close['IsReadjusted']
    
    # 转换回原始日期格式
    if original_date_format:
        df_close['date'] = df_close['date'].dt.strftime(original_date_format)
    
    return df_close, True

In [26]:
if 'ExDiviDate' in adj_factors.columns:
        adj_factors['ExDiviDate'] = pd.to_datetime(adj_factors['ExDiviDate'])
        
# 处理数据 - 对每个股票应用复权因子

result_dfs = []

noadj_count = 0

for code, group in order_df.groupby('security_code'):
    adjusted_group, lable = get_adjust_price(group.copy(), adj_factors)
    if not lable:
        noadj_count += 1
    result_dfs.append(adjusted_group)

daily_adjust = pd.concat(result_dfs, ignore_index=True)
        

adj_factors中没有001222数据，返回原始数据
adj_factors中没有001226数据，返回原始数据
adj_factors中没有001228数据，返回原始数据
adj_factors中没有001229数据，返回原始数据
adj_factors中没有001230数据，返回原始数据
adj_factors中没有001231数据，返回原始数据
adj_factors中没有001236数据，返回原始数据
adj_factors中没有001258数据，返回原始数据
adj_factors中没有001259数据，返回原始数据
adj_factors中没有001268数据，返回原始数据
adj_factors中没有001270数据，返回原始数据
adj_factors中没有001283数据，返回原始数据
adj_factors中没有001309数据，返回原始数据
adj_factors中没有001316数据，返回原始数据
adj_factors中没有001318数据，返回原始数据
adj_factors中没有001323数据，返回原始数据
adj_factors中没有001330数据，返回原始数据
adj_factors中没有001331数据，返回原始数据
adj_factors中没有001336数据，返回原始数据
adj_factors中没有001339数据，返回原始数据
adj_factors中没有300114数据，返回原始数据
adj_factors中没有300842数据，返回原始数据
adj_factors中没有300949数据，返回原始数据
adj_factors中没有301030数据，返回原始数据
adj_factors中没有301045数据，返回原始数据
adj_factors中没有301095数据，返回原始数据
adj_factors中没有301107数据，返回原始数据
adj_factors中没有301112数据，返回原始数据
adj_factors中没有301115数据，返回原始数据
adj_factors中没有301121数据，返回原始数据
adj_factors中没有301125数据，返回原始数据
adj_factors中没有301132数据，返回原始数据
adj_factors中没有301139数据，返回原始数据
adj_factor

In [27]:
print(daily_adjust[daily_adjust['security_code']=='688981']) 

print(noadj_count)

                     date security_code  daily_value_p01_p99  \
4795  2022-09-09 00:00:00        688981         5.629170e+08   

      daily_sum_price_p01_p99  daily_volume_p01_p99  daily_count_p01_p99  \
4795                363303.89            14058130.0                 9077   

      daily_value_p05_p95  daily_sum_price_p05_p95  daily_volume_p05_p95  \
4795          563521545.2                363786.13            14058130.0   

      daily_count_p05_p95  adj_value_p01_p99  adj_price_p01_p99  \
4795                 9077       5.629170e+08          363303.89   

      adj_value_p05_p95  adj_price_p05_p95 IsReadjusted  
4795        563521545.2          363786.13        False  
198


In [29]:
import os
import glob
import pandas as pd
import multiprocessing
from tqdm import tqdm
import time
from datetime import datetime


def get_adjust_price(df_close, adj_factors):
    """
    使用复权因子调整价格（只对价格相关字段进行复权，不对交易量进行复权）
    
    参数:
    df_close: 包含价格数据的DataFrame，包含p01-p99和p05-p95两种缩尾处理的结果
    adj_factors: 当日对应的复权因子DataFrame
    
    返回:
    调整后的价格DataFrame
    """
    
    # 保存原始日期格式
    original_date_type = df_close['date'].dtype
    original_date_format = None
    if original_date_type == 'object':
        # 如果是字符串，记录格式
        sample_date = df_close['date'].iloc[0]
        if isinstance(sample_date, str):
            if '-' in sample_date:
                original_date_format = '%Y-%m-%d'
            else:
                original_date_format = '%Y%m%d'
    
    # 转换为日期类型进行计算
    if original_date_type == 'object':
        df_close['date'] = pd.to_datetime(df_close['date'])
    
    df_close['IsReadjusted'] = False
    
    # 创建需要复权的字段副本
    # 对p01-p99缩尾处理的价格相关字段进行复权
    df_close['adj_value_p01_p99'] = df_close['daily_value_p01_p99']
    df_close['adj_price_p01_p99'] = df_close['daily_sum_price_p01_p99']
    
    # 对p05-p95缩尾处理的价格相关字段进行复权
    df_close['adj_value_p05_p95'] = df_close['daily_value_p05_p95']
    df_close['adj_price_p05_p95'] = df_close['daily_sum_price_p05_p95']
    
    # 只对价格相关字段进行复权
    cols2adj = [
        'adj_value_p01_p99', 'adj_price_p01_p99',
        'adj_value_p05_p95', 'adj_price_p05_p95'
    ]
    
    security_code = df_close['security_code'].iloc[0]
    # 从复权因子表中查找对应的股票
    df_adj_spec = adj_factors[adj_factors.security_code == security_code]
    
    if df_adj_spec.shape[0] == 0:
        print(f"adj_factors中没有{security_code}数据，返回原始数据")
        return df_close
    
    # 按照复权日期排序
    df_adj_spec = df_adj_spec.sort_values(by='ExDiviDate', ascending=False)
    
    # 应用复权因子
    for d, f in df_adj_spec[['ExDiviDate', 'AdjustingFactor']].values:
        # 确保d是日期类型
        if not isinstance(d, pd.Timestamp):
            d = pd.to_datetime(d)
        
        mask = (df_close.date >= d) & ~df_close.IsReadjusted
        df_close.loc[mask, cols2adj] = df_close.loc[mask, cols2adj] * f
        df_close.loc[mask, 'IsReadjusted'] = True
    
    del df_close['IsReadjusted']
    
    # 转换回原始日期格式
    if original_date_format:
        df_close['date'] = df_close['date'].dt.strftime(original_date_format)
    
    return df_close



def process_file(file_path):
    """处理单个文件的函数"""
    try:
        # 检查是否已处理过
        file_name = os.path.basename(file_path)
        marker_file = os.path.join(os.path.dirname(file_path), ".processed", file_name + ".done")
        
        # 如果标记文件存在，说明已处理过
        if os.path.exists(marker_file):
            return {"file": file_path, "status": "skipped", "message": "Already processed"}
        
        # 读取数据
        daily_data = pd.read_parquet(file_path)
        
        # 检查数据内容是否已经处理过
        if 'adj_value' in daily_data.columns:
            return {"file": file_path, "status": "skipped", "message": "Data already has adjusted columns"}
        
        # 保存原始数据的schema
        original_schema = {col: daily_data[col].dtype for col in daily_data.columns}
        
        # 获取日期信息
        date_value = daily_data['date'].iloc[0]
        if isinstance(date_value, str):
            # 如果日期格式是'YYYY-MM-DD'，转换为'YYYYMMDD'
            if '-' in date_value:
                date_obj = datetime.strptime(date_value, '%Y-%m-%d')
                date_str = date_obj.strftime('%Y%m%d')
            else:
                date_str = date_value
        else:
            # 如果是Timestamp或其他日期类型
            date_str = pd.to_datetime(date_value).strftime('%Y%m%d')
        
        # 加载对应日期的复权因子
        adj_factor_path = f"./Daily_Adjusting_Factor/{date_str}.parquet"
        
        if not os.path.exists(adj_factor_path):
            return {"file": file_path, "status": "error", "message": f"Adjustment factor file not found for date {date_str}"}
        
        # 读取复权因子数据
        adj_factors = pd.read_parquet(adj_factor_path)
        
        # 确保ExDiviDate列是日期格式
        if 'ExDiviDate' in adj_factors.columns:
            adj_factors['ExDiviDate'] = pd.to_datetime(adj_factors['ExDiviDate'])
        
        # 处理数据 - 对每个股票应用复权因子
        result_dfs = []
        for code, group in daily_data.groupby('security_code'):
            adjusted_group = get_adjust_price(group.copy(), adj_factors)
            result_dfs.append(adjusted_group)
        
        daily_adjust = pd.concat(result_dfs, ignore_index=True)
        
        # 确保数据类型与原始数据一致
        for col in original_schema:
            if col in daily_adjust.columns:
                try:
                    daily_adjust[col] = daily_adjust[col].astype(original_schema[col])
                except:
                    # 如果无法转换，保留当前类型
                    pass
        
        # 尝试将日期列转换为整数，如果原始格式是整数
        try:
            if original_schema['date'] == 'int64':
                daily_adjust['date'] = daily_adjust['date'].astype('int64')
            elif original_schema['date'] == 'int32':
                daily_adjust['date'] = daily_adjust['date'].astype('int32')
        except:
            # 如果无法转换，尝试其他方法
            try:
                if isinstance(daily_adjust['date'].iloc[0], str) and daily_adjust['date'].iloc[0].isdigit():
                    daily_adjust['date'] = daily_adjust['date'].astype('int64')
            except:
                pass
        
        # 保存处理后的数据
        daily_adjust['date'] = daily_adjust['date'].astype(str)
        # 使用pyarrow引擎并指定schema
        daily_adjust.to_parquet(
            file_path, 
            index=False,
            engine='pyarrow',
            compression='snappy'
        )
        
        # 创建处理完成标记
        os.makedirs(os.path.dirname(marker_file), exist_ok=True)
        with open(marker_file, 'w') as f:
            f.write(f"Processed at {time.strftime('%Y-%m-%d %H:%M:%S')}")
        
        return {"file": file_path, "status": "success", "message": "Processed successfully"}
    
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        return {"file": file_path, "status": "error", "message": f"{str(e)}\n{error_details}"}


def Adjust_price(num_processes=None):
    """使用多进程调整价格数据"""
    # 设置目录
    data_dir = "./factors/Test_DD_Bid_Percent_adjusted_APB"
    
    # 创建处理标记目录
    processed_dir = os.path.join(data_dir, ".processed")
    os.makedirs(processed_dir, exist_ok=True)
    
    # 获取所有parquet文件
    parquet_files = sorted(glob.glob(os.path.join(data_dir, "*.parquet")))
    
    if not parquet_files:
        print(f"没有找到parquet文件在目录: {data_dir}")
        return
    
    print(f"找到 {len(parquet_files)} 个文件需要处理")
    
    # 检查复权因子目录是否存在
    adj_factor_dir = "./Daily_Adjusting_Factor"
    if not os.path.exists(adj_factor_dir):
        print(f"错误: 复权因子目录不存在: {adj_factor_dir}")
        return
    
    # 设置进程数
    if num_processes is None:
        num_processes = min(20, multiprocessing.cpu_count())
    
    # 创建进程池
    pool = multiprocessing.Pool(processes=num_processes)
    
    try:
        # 使用tqdm显示进度
        results = []
        for result in tqdm(pool.imap_unordered(process_file, parquet_files), 
                          total=len(parquet_files), 
                          desc="处理文件"):
            results.append(result)
        
        # 关闭进程池
        pool.close()
        pool.join()
        
        # 统计处理结果
        success_count = sum(1 for r in results if r["status"] == "success")
        skipped_count = sum(1 for r in results if r["status"] == "skipped")
        error_count = sum(1 for r in results if r["status"] == "error")
        
        print(f"处理完成: 成功 {success_count}, 跳过 {skipped_count}, 错误 {error_count}")
        
        # 如果有错误，打印错误详情
        if error_count > 0:
            print("\n错误详情:")
            for r in results:
                if r["status"] == "error":
                    print(f"  {r['file']}: {r['message']}")
        
    except KeyboardInterrupt:
        print("\n处理被中断，正在清理资源...")
        pool.terminate()
        pool.join()
    except Exception as e:
        print(f"发生错误: {str(e)}")
        pool.terminate()
        pool.join()

# 使用示例
if __name__ == "__main__":
    Adjust_price(num_processes=10)


找到 10 个文件需要处理


处理文件:   0%|          | 0/10 [00:00<?, ?it/s]

adj_factors中没有000022数据，返回原始数据adj_factors中没有000022数据，返回原始数据
adj_factors中没有000022数据，返回原始数据

adj_factors中没有000022数据，返回原始数据
adj_factors中没有000022数据，返回原始数据
adj_factors中没有000043数据，返回原始数据
adj_factors中没有000043数据，返回原始数据
adj_factors中没有000043数据，返回原始数据
adj_factors中没有000043数据，返回原始数据
adj_factors中没有000043数据，返回原始数据
adj_factors中没有000166数据，返回原始数据
adj_factors中没有000166数据，返回原始数据
adj_factors中没有000166数据，返回原始数据
adj_factors中没有000166数据，返回原始数据
adj_factors中没有000166数据，返回原始数据
adj_factors中没有001979数据，返回原始数据
adj_factors中没有001979数据，返回原始数据
adj_factors中没有001979数据，返回原始数据
adj_factors中没有001979数据，返回原始数据
adj_factors中没有001222数据，返回原始数据
adj_factors中没有001226数据，返回原始数据
adj_factors中没有001228数据，返回原始数据
adj_factors中没有001222数据，返回原始数据adj_factors中没有001222数据，返回原始数据adj_factors中没有001229数据，返回原始数据


adj_factors中没有001226数据，返回原始数据adj_factors中没有001226数据，返回原始数据adj_factors中没有001230数据，返回原始数据


adj_factors中没有001222数据，返回原始数据
adj_factors中没有001231数据，返回原始数据
adj_factors中没有001226数据，返回原始数据
adj_factors中没有001228数据，返回原始数据
adj_factors中没有001229数据，返回原始数据
adj_factor

处理文件:  50%|█████     | 5/10 [00:12<00:09,  1.88s/it]

adj_factors中没有300842数据，返回原始数据
adj_factors中没有300842数据，返回原始数据
adj_factors中没有300949数据，返回原始数据
adj_factors中没有300949数据，返回原始数据
adj_factors中没有301030数据，返回原始数据
adj_factors中没有301030数据，返回原始数据
adj_factors中没有301045数据，返回原始数据
adj_factors中没有301045数据，返回原始数据
adj_factors中没有301095数据，返回原始数据
adj_factors中没有301095数据，返回原始数据
adj_factors中没有301107数据，返回原始数据
adj_factors中没有301107数据，返回原始数据
adj_factors中没有301112数据，返回原始数据
adj_factors中没有301115数据，返回原始数据
adj_factors中没有301112数据，返回原始数据
adj_factors中没有301115数据，返回原始数据
adj_factors中没有301121数据，返回原始数据
adj_factors中没有301125数据，返回原始数据adj_factors中没有301121数据，返回原始数据

adj_factors中没有301125数据，返回原始数据
adj_factors中没有301132数据，返回原始数据
adj_factors中没有301132数据，返回原始数据
adj_factors中没有301139数据，返回原始数据
adj_factors中没有301139数据，返回原始数据
adj_factors中没有301152数据，返回原始数据
adj_factors中没有301153数据，返回原始数据
adj_factors中没有301156数据，返回原始数据
adj_factors中没有301152数据，返回原始数据
adj_factors中没有301153数据，返回原始数据
adj_factors中没有301156数据，返回原始数据adj_factors中没有301160数据，返回原始数据

adj_factors中没有301163数据，返回原始数据
adj_factors中没有301160数据，返回原始数据
adj_factor

处理文件: 100%|██████████| 10/10 [00:26<00:00,  2.69s/it]

处理完成: 成功 10, 跳过 0, 错误 0





In [30]:
new_order_path ="/data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Test_DD_Bid_Percent_adjusted_APB/20220905.parquet"

new_order_df = pd.read_parquet(new_order_path)

print(len(new_order_df))

print(new_order_df[new_order_df['security_code']=='688981'])

4789
                     date security_code  daily_value_p01_p99  \
4787  2022-09-05 00:00:00        688981         5.716488e+08   

      daily_sum_price_p01_p99  daily_volume_p01_p99  daily_count_p01_p99  \
4787                 449658.1            14370941.0                11311   

      daily_value_p05_p95  daily_sum_price_p05_p95  daily_volume_p05_p95  \
4787         5.723410e+08                450252.02            14370941.0   

      daily_count_p05_p95  adj_value_p01_p99  adj_price_p01_p99  \
4787                11311       5.716488e+08           449658.1   

      adj_value_p05_p95  adj_price_p05_p95 IsReadjusted  
4787       5.723410e+08          450252.02        False  


In [31]:
import os
import glob
import pandas as pd
import numpy as np
import logging
import traceback
import multiprocessing
from functools import partial
import gc

# 假设logger已经在其他地方定义
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def process_single_date(file_dates, i, window_size=5,  output_dir="./factors/Test_DD_Bid_Percent_5d_rolling_APB"):
    """处理单个日期的数据，分别计算p01-p99和p05-p95两种缩尾方法的APB因子"""
    current_date, current_file = file_dates[i]
    logger.info(f"处理日期 {current_date} 的数据")
    
    # 检查输出文件是否已存在
    output_file = os.path.join(output_dir, f"{current_date}.parquet")
    
    if os.path.exists(output_file):
        logger.info(f"日期 {current_date} 的结果文件已存在，跳过处理")
        return current_date, True
    
    try:
        # 获取当前日期和前几天的文件
        start_idx = max(0, i-window_size+1)  # 确保不会索引到负数
        recent_files = [path for _, path in file_dates[start_idx:i+1]]
        
        # 读取这些天的数据
        dfs = []
        for file_path in recent_files:
            try:
                df = pd.read_parquet(file_path)
                dfs.append(df)
            except Exception as e:
                logger.error(f"读取文件 {file_path} 时出错: {str(e)}")
                continue
        
        if not dfs:
            logger.warning(f"日期 {current_date} 没有有效数据，跳过")
            return current_date, False
        
        # 合并数据
        combined_data = pd.concat(dfs, ignore_index=True)
        combined_data.sort_values(['security_code', 'date'], inplace=True)
        
        # 创建结果DataFrame
        result_df = pd.DataFrame({'security_code': combined_data['security_code'].unique()})
        result_df['date'] = current_date
        
        # 计算p01-p99缩尾处理的vwap和twap
        p01_p99_vwap = combined_data.groupby('security_code').apply(
            lambda x: x['adj_value_p01_p99'].sum() / x['daily_volume_p01_p99'].sum()
        ).reset_index()
        p01_p99_vwap.columns = ['security_code', 'adj_vwap_p01_p99']
        
        p01_p99_twap = combined_data.groupby('security_code').apply(
            lambda x: x['adj_price_p01_p99'].sum() / x['daily_count_p01_p99'].sum()
        ).reset_index()
        p01_p99_twap.columns = ['security_code', 'adj_twap_p01_p99']
        
        # 计算p05-p95缩尾处理的vwap和twap
        p05_p95_vwap = combined_data.groupby('security_code').apply(
            lambda x: x['adj_value_p05_p95'].sum() / x['daily_volume_p05_p95'].sum()
        ).reset_index()
        p05_p95_vwap.columns = ['security_code', 'adj_vwap_p05_p95']
        
        p05_p95_twap = combined_data.groupby('security_code').apply(
            lambda x: x['adj_price_p05_p95'].sum() / x['daily_count_p05_p95'].sum()
        ).reset_index()
        p05_p95_twap.columns = ['security_code', 'adj_twap_p05_p95']
        
        # 合并所有结果
        result_df = pd.merge(result_df, p01_p99_vwap, on='security_code', how='left')
        result_df = pd.merge(result_df, p01_p99_twap, on='security_code', how='left')
        result_df = pd.merge(result_df, p05_p95_vwap, on='security_code', how='left')
        result_df = pd.merge(result_df, p05_p95_twap, on='security_code', how='left')
        
        # 计算两种缩尾处理方法的APB因子
        result_df['5d_apb_p01_p99'] = np.log(result_df['adj_twap_p01_p99'] / result_df['adj_vwap_p01_p99'])
        result_df['5d_apb_p05_p95'] = np.log(result_df['adj_twap_p05_p95'] / result_df['adj_vwap_p05_p95'])
        
        # 处理无穷大值
        result_df['5d_apb_p01_p99'].replace([-np.inf, np.inf], np.nan, inplace=True)
        result_df['5d_apb_p05_p95'].replace([-np.inf, np.inf], np.nan, inplace=True)
        
        # 去除中间计算列，只保留最终因子值
        result_df = result_df[['security_code', 'date', '5d_apb_p01_p99', '5d_apb_p05_p95']]
        
        # 确保输出目录存在
        os.makedirs(output_dir, exist_ok=True)
        
        # 保存结果
        result_df.to_parquet(output_file)
        
        return current_date, True
        
    except Exception as e:
        logger.error(f"处理日期 {current_date} 时发生错误: {str(e)}")
        logger.error(traceback.format_exc())
        return current_date, False


def calculate_rolling_average(window_size=5, start_date=None, end_date=None, num_processes=15):
    """计算滚动平均APB因子，采用多进程处理方式"""
    data_dir = "./factors/Test_DD_Bid_Percent_adjusted_APB"  # 更新为新的数据目录
    parquet_files = sorted(glob.glob(os.path.join(data_dir, "*.parquet")))

    if not parquet_files:
        raise ValueError("未找到日度数据文件")

    logger.info(f"找到 {len(parquet_files)} 个日度数据文件")
    
    # 假设文件名是日期格式，如 "20210602.parquet"
    # 提取文件名中的日期并排序
    file_dates = []
    for file_path in parquet_files:
        file_name = os.path.basename(file_path)
        date_str = file_name.split('.')[0]  # 假设文件名格式为 "YYYYMMDD.parquet"
        file_dates.append((date_str, file_path))
    
    # 按日期排序
    file_dates.sort(key=lambda x: x[0])
    
    # 如果指定了日期范围，筛选文件
    if start_date and end_date:
        file_dates = [(date, path) for date, path in file_dates 
                      if start_date <= date <= end_date]
    
    if not file_dates:
        raise ValueError("筛选后没有符合条件的数据文件")

    output_dir = "./factors/Test_DD_Bid_Percent_5d_rolling_APB"  # 更新为新的输出目录

    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)

    # 准备要处理的日期索引列表（从第window_size个文件开始，确保有前window_size-1天的数据）
    indices_to_process = list(range(window_size-1, len(file_dates)))
    
    # 使用多进程处理
    with multiprocessing.Pool(processes=num_processes) as pool:
        # 使用partial固定其他参数，只让索引i变化
        process_func = partial(
            process_single_date, 
            file_dates, 
            window_size=window_size, 
            output_dir=output_dir
        )
        
        # 并行处理所有日期
        results = pool.map(process_func, indices_to_process)
    
    # 处理结果
    success_count = sum(1 for _, success in results if success)
    logger.info(f"处理完成，成功处理 {success_count} 个日期，总共 {len(indices_to_process)} 个日期")
    
    return output_dir

# 如果直接运行此脚本
if __name__ == "__main__":
    calculate_rolling_average(window_size=5, num_processes=15)

2025-08-18 15:08:03,561 - __main__ - INFO - 找到 10 个日度数据文件
2025-08-18 15:08:03,666 - __main__ - INFO - 处理日期 20220906 的数据
2025-08-18 15:08:03,666 - __main__ - INFO - 处理日期 20220909 的数据
2025-08-18 15:08:03,666 - __main__ - INFO - 处理日期 20220907 的数据
2025-08-18 15:08:03,665 - __main__ - INFO - 处理日期 20160624 的数据
2025-08-18 15:08:03,666 - __main__ - INFO - 处理日期 20220908 的数据
2025-08-18 15:08:03,666 - __main__ - INFO - 处理日期 20220905 的数据
  p01_p99_vwap = combined_data.groupby('security_code').apply(
  p01_p99_vwap = combined_data.groupby('security_code').apply(
  p01_p99_vwap = combined_data.groupby('security_code').apply(
  p01_p99_vwap = combined_data.groupby('security_code').apply(
  p01_p99_vwap = combined_data.groupby('security_code').apply(
  p01_p99_vwap = combined_data.groupby('security_code').apply(
  p01_p99_twap = combined_data.groupby('security_code').apply(
  p05_p95_vwap = combined_data.groupby('security_code').apply(
  p01_p99_twap = combined_data.groupby('security_code').apply(
  p

In [32]:
final_order = pd.read_parquet("/data/home/lexuanchen/.conda/envs/Order_Improvement/factors/Test_DD_Bid_Percent_5d_rolling_APB/20160624.parquet")

print(final_order.head())

  security_code      date  5d_apb_p01_p99  5d_apb_p05_p95
0        000001  20160624       -0.002055       -0.001680
1        000005  20160624       -0.003008       -0.002752
2        000006  20160624       -0.002040       -0.001943
3        000008  20160624       -0.002672       -0.002715
4        000009  20160624       -0.003374       -0.003196
