In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import os
from datetime import datetime
from tqdm import tqdm
import multiprocessing as mp
from functools import partial

In [17]:
# Create a connection engine for the jydb database.
def get_engine():
    """Build a SQLAlchemy engine for the jydb MySQL database.

    The connection URL is taken from the ``JYDB_URL`` environment variable
    when it is set and non-empty; otherwise the legacy built-in URL is used
    so that existing runs keep working.

    Returns:
        A new SQLAlchemy ``Engine``. The caller owns it and should call
        ``dispose()`` once it is no longer needed.
    """
    # SECURITY: credentials must not live in source control. Set JYDB_URL in
    # the environment instead. TODO: remove this literal fallback and rotate
    # the password once every runner exports JYDB_URL.
    url = os.environ.get("JYDB_URL") or (
        "mysql://lexuan_chen%40public%23Thetis:OWFF4UT!@192.168.55.161:2883/jydb?charset=utf8mb4"
    )
    return create_engine(url)

# Strip non-ASCII characters from one cell value, leaving missing values alone.
def _ascii_only(value):
    """Return ``str(value)`` with non-ASCII characters dropped.

    ``None`` and float NaN are returned unchanged so that missing values are
    not turned into the literal strings ``"None"`` / ``"nan"``.
    """
    if value is None or (isinstance(value, float) and value != value):
        return value
    return str(value).encode('ascii', 'ignore').decode('ascii')


# Persist one snapshot, preferring Parquet and falling back to CSV.
def _write_snapshot(df_latest, file_path):
    """Write ``df_latest`` to ``file_path`` and return a status suffix.

    Parquet is tried with fastparquet first and pyarrow second. Each attempt
    writes to a temporary sibling file that is moved into place only after the
    write succeeded, so a failed attempt never leaves a truncated file at
    ``file_path`` (which the "already exists" check would otherwise skip on
    every later run). If both Parquet engines fail, a CSV is written next to
    the intended Parquet path instead.

    Returns:
        ``""`` when Parquet was written, ``"(保存为CSV)"`` for the CSV fallback.

    Raises:
        Whatever ``DataFrame.to_csv`` raises if the CSV fallback fails as well.
    """
    tmp_path = f"{file_path}.{os.getpid()}.tmp"
    try:
        for parquet_engine in ('fastparquet', 'pyarrow'):
            try:
                df_latest.to_parquet(tmp_path, index=False, engine=parquet_engine)
            except Exception:
                # Includes ImportError when the engine is not installed;
                # move on to the next writer.
                continue
            os.replace(tmp_path, file_path)
            return ""
    finally:
        # Drop whatever a failed attempt left behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # NOTE: a CSV result is not detected by the skip check in process_date
    # (it only looks for the .parquet file), so such dates are re-processed
    # on the next run.
    df_latest.to_csv(file_path.replace('.parquet', '.csv'), index=False, encoding='ascii')
    return "(保存为CSV)"


# Process a single date.
def process_date(date, engine=None):
    """Export the adjusting factors in effect on ``date`` to one file.

    Queries every adjusting-factor row whose ``ExDiviDate`` is on or before
    ``date`` for the securities selected by the SQL filter, keeps one record
    per ``InnerCode`` (the one with the greatest ``ExDiviDate``) and stores the
    result as ``./Daily_Adjusting_Factor/YYYYMMDD.parquet``.

    Args:
        date: Object with a ``strftime`` method (e.g. ``datetime`` or
            ``pandas.Timestamp``) identifying the snapshot date.
        engine: Optional SQLAlchemy engine to reuse. When ``None`` a private
            engine is created via ``get_engine()`` and disposed before
            returning.

    Returns:
        A status string starting with the ``YYYY-MM-DD`` date: skipped
        (file already exists), success, success with CSV fallback, no data,
        or failure followed by the ASCII-only error text. Query and write
        errors are reported through the return value rather than raised.
    """
    output_dir = "./Daily_Adjusting_Factor/"
    os.makedirs(output_dir, exist_ok=True)

    date_str = date.strftime("%Y-%m-%d")
    file_path = os.path.join(output_dir, date.strftime("%Y%m%d") + ".parquet")

    # Skip dates that were already exported; no engine is needed for that.
    if os.path.exists(file_path):
        return f"{date_str} 已存在，跳过处理"

    owns_engine = engine is None
    if owns_engine:
        engine = get_engine()

    try:
        # Adjusting factors up to and including the requested date. The date
        # is passed as a bound parameter instead of being formatted into the
        # SQL text.
        sql = text('''
    SELECT 
        a.InnerCode, 
        a.ExDiviDate, 
        a.RatioAdjustingFactor as AdjustingFactor,
        b.SecuCode AS security_code
    FROM 
        jydb.DZ_AdjustingFactor a 
    JOIN 
        jydb.SecuMain b ON a.InnerCode = b.InnerCode
    WHERE 
        b.SecuCategory = 1  -- 股票
        AND b.SecuMarket IN (83, 90)  -- 上海和深圳市场
        AND a.ExDiviDate <= :as_of_date
    ''')

        with engine.connect() as conn:
            df_adj = pd.read_sql(sql, conn, params={"as_of_date": date_str})

        if df_adj.empty:
            return f"{date_str} 无数据"

        # Make sure the date column has a datetime dtype.
        df_adj['ExDiviDate'] = pd.to_datetime(df_adj['ExDiviDate'])

        # Reduce every object (string) column to ASCII-only text.
        for col in df_adj.select_dtypes(include=['object']):
            df_adj[col] = df_adj[col].apply(_ascii_only)

        # Keep the most recent record per security.
        # NOTE(review): groupby().last() takes the last non-null value of each
        # column independently, so a null in the newest row is filled from an
        # older row. Confirm this is intended; otherwise use
        # drop_duplicates('InnerCode', keep='last').
        df_adj = df_adj.sort_values(['InnerCode', 'ExDiviDate'])
        df_latest = df_adj.groupby('InnerCode').last().reset_index()

        suffix = _write_snapshot(df_latest, file_path)
        return f"{date_str} 处理成功{suffix}"
    except Exception as e:
        # Reduce the message to ASCII so it can always be printed.
        error_msg = str(e).encode('ascii', 'ignore').decode('ascii')
        return f"{date_str} 处理失败: {error_msg}"
    finally:
        # Release the pooled connections of an engine created here; an engine
        # supplied by the caller stays under the caller's control.
        if owns_engine:
            engine.dispose()


In [18]:
def main(start_date='2015-01-01', end_date='2025-06-23', max_processes=10):
    """Export one adjusting-factor snapshot per trading day, in parallel.

    Loads the trading calendar between ``start_date`` and ``end_date``
    (inclusive), runs ``process_date`` for every trading day in a process pool
    and prints a summary of the per-date outcomes.

    Args:
        start_date: First calendar date to include, as ``'YYYY-MM-DD'``.
        end_date: Last calendar date to include, as ``'YYYY-MM-DD'``.
        max_processes: Upper bound on the number of worker processes; the
            actual number is also capped by the CPU count.
    """
    # Fetch the trading calendar. The engine is built by get_engine() so the
    # connection settings live in one place, and it is disposed before the
    # worker pool starts so that forked workers do not inherit open pooled
    # connections from the parent.
    engine = get_engine()
    try:
        with engine.connect() as conn:
            print("正在获取交易日历...")
            trading_days = pd.read_sql(
                text('''
            SELECT DISTINCT TradingDate
            FROM jydb.QT_TradingDayNew 
            WHERE IfTradingDay=1 AND SecuMarket IN (83, 90)
            AND TradingDate <= :end_date
            AND TradingDate >= :start_date
            ORDER BY TradingDate
        '''),
                conn,
                params={"start_date": start_date, "end_date": end_date},
            )
    finally:
        engine.dispose()

    trading_days['TradingDate'] = pd.to_datetime(trading_days['TradingDate'])
    print(f"获取到 {len(trading_days)} 个交易日")

    # Cap the number of worker processes, but always use at least one.
    num_processes = max(1, min(mp.cpu_count(), max_processes))
    print(f"使用 {num_processes} 个进程并行处理")

    dates = trading_days['TradingDate'].tolist()

    # Each worker calls process_date(date) and therefore opens its own engine.
    with mp.Pool(num_processes) as pool:
        results = list(tqdm(pool.imap(process_date, dates), total=len(dates)))

    # Summarise the outcomes by the marker text in each status string.
    success = sum(1 for r in results if "处理成功" in r)
    skipped = sum(1 for r in results if "已存在" in r)
    failed = sum(1 for r in results if "处理失败" in r)
    no_data = sum(1 for r in results if "无数据" in r)

    # Show the failure details instead of only counting them.
    for r in results:
        if "处理失败" in r:
            print(r)

    print(f"处理完成: 成功 {success}, 跳过 {skipped}, 失败 {failed}, 无数据 {no_data}")

# Run the export only when this code executes as the main module; an import of
# the module (e.g. by a multiprocessing worker that re-imports it) does not
# start another run.
if __name__ == "__main__":
    main()


正在获取交易日历...
获取到 2543 个交易日
使用 10 个进程并行处理


100%|██████████| 2543/2543 [01:14<00:00, 34.23it/s]


处理完成: 成功 2543, 跳过 0, 失败 0, 无数据 0
