In [1]:
import connectorx as cx
import pandas as pd
import numpy as np
import os
import duckdb
from datetime import datetime
import glob
import multiprocessing as mp
from functools import partial
import time

from tqdm import tqdm

In [2]:
# Location of the factor CSV to load (absolute path on the research host).
path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_Early_Order_Size_Ratio_3_2/early_930_1030_net_order_volume_ratio.csv"

# Read the whole CSV into memory with pandas' default type inference.
df = pd.read_csv(path)

# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,date,security_code,early_930_1030_net_order_volume_ratio
0,2016-06-20,1,
1,2016-06-20,5,
2,2016-06-20,6,
3,2016-06-20,8,
4,2016-06-20,9,


In [4]:
# Print the column names, dtypes, row count and memory footprint of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8996951 entries, 0 to 8996950
Data columns (total 3 columns):
 #   Column                                 Dtype  
---  ------                                 -----  
 0   date                                   object 
 1   security_code                          int64  
 2   early_930_1030_net_order_volume_ratio  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 205.9+ MB


In [4]:
def split_exchanges(df):
    """
    Split a stock DataFrame into Shanghai and Shenzhen exchange subsets.

    Rows are classified purely by the integer value of ``security_code``:

    * Shanghai: 600000-699999 (codes starting with 6, including the
      688xxx STAR-market range).
    * Shenzhen: 0-3999 (main board 000xxx/001xxx, SME board 002xxx and
      003xxx) and 300000-399999 (the range holding ChiNext 300xxx codes).

    Rows whose code falls in neither range are dropped from both results;
    their count and distinct codes are printed as a warning.

    The input DataFrame is NOT modified: the integer cast is done on a
    separate Series and written only into the returned copies.

    Parameters:
    df (pandas.DataFrame): stock data; must contain a ``security_code``
        column whose values can be cast with ``astype(int)``.

    Returns:
    tuple: (Shanghai DataFrame, Shenzhen DataFrame). Both are independent
        copies that keep the original row labels and column order, with
        ``security_code`` stored as an integer dtype.

    Raises:
    KeyError: if ``security_code`` is missing.
    ValueError/TypeError: propagated from ``astype(int)`` when a code cannot
        be converted (for example NaN).
    """
    # Cast on a detached Series so the caller's frame keeps its dtype/values.
    codes = df['security_code'].astype(int)

    # Shanghai: every 6xxxxx code (``between`` is inclusive on both ends).
    sh_mask = codes.between(600000, 699999)

    # Shenzhen: 000xxx-003xxx become 0-3999 once leading zeros are dropped by
    # the integer cast; 300000-399999 covers the 3xxxxx codes.
    sz_mask = codes.between(0, 3999) | codes.between(300000, 399999)

    # Materialise independent copies, then store the integer codes in them.
    # ``to_numpy()`` sidesteps index alignment (safe even with duplicate labels)
    # because both sides were selected with the same mask in the same order.
    sh_df = df[sh_mask].copy()
    sz_df = df[sz_mask].copy()
    sh_df['security_code'] = codes[sh_mask].to_numpy()
    sz_df['security_code'] = codes[sz_mask].to_numpy()

    # Report the split sizes.
    print(f"原始数据行数: {len(df)}")
    print(f"上交所数据行数: {len(sh_df)}")
    print(f"深交所数据行数: {len(sz_df)}")
    print(f"分类后总行数: {len(sh_df) + len(sz_df)}")

    # The two masks cannot overlap, so any shortfall is unclassified rows.
    unclassified = len(df) - len(sh_df) - len(sz_df)
    if unclassified > 0:
        print(f"警告: 有 {unclassified} 行数据未被分类")
        # List the distinct codes that matched neither exchange.
        unclassified_codes = codes[~(sh_mask | sz_mask)].unique()
        print(f"未分类的股票代码: {unclassified_codes}")

    return sh_df, sz_df



In [6]:
# Split the loaded factor table into Shanghai and Shenzhen subsets.
sh_df, sz_df = split_exchanges(df)

# Output directory for the per-exchange factor files; created if missing.
file_path = "/data/home/lexuanchen/Factors/Order/Signal/Improved_Early_Order_Size_Ratio_3_2_ListSector"
os.makedirs(file_path, exist_ok=True)

# index=False: the row labels of the subsets are only leftover positions from
# the unsplit frame; writing them would add an unnamed extra column to the
# output files that the source table does not have.
sh_df.to_csv(f"{file_path}/sh_early_net_order_volume_ratio.csv", index=False)
sz_df.to_csv(f"{file_path}/sz_early_net_order_volume_ratio.csv", index=False)

原始数据行数: 8996951
上交所数据行数: 3855856
深交所数据行数: 5141095
分类后总行数: 8996951


In [5]:
# Parquet file holding the order records to inspect (a single file is loaded here).
order_pth = "/data/cephfs/order/20160831.parquet"

# Load the full order table into memory.
order_df = pd.read_parquet(order_pth)

# Split the orders into Shanghai and Shenzhen frames by security_code range.
sh_order, sz_order = split_exchanges(order_df)

# Plain-text preview of the first Shanghai rows.
print(sh_order.head())

# Plain-text preview of the first Shenzhen rows.
print(sz_order.head())

原始数据行数: 41574039
上交所数据行数: 16236469
深交所数据行数: 25337570
分类后总行数: 41574039
          security_code trading_day  exchange_code  order_time  appl_seq_num  \
25337570         600486  2016-08-31              1    91500550       27216.0   
25337571         600486  2016-08-31              1    91500650       27217.0   
25337572         600486  2016-08-31              1    91500650       27218.0   
25337573         600486  2016-08-31              1    91500650       27219.0   
25337574         600486  2016-08-31              1    91500710       27220.0   

          order_side order_type order_details  order_no order_price  \
25337570          -1          A             L      1212       32.45   
25337571          -1          A             L      1654       32.58   
25337572          -1          A             L      1655       32.88   
25337573          -1          A             L      1656       32.95   
25337574          -1          A             L      2180       33.00   

         order_price_a

In [6]:
# Order each exchange's records by security first, then by order time within
# a security. Each frame is rebound to its sorted copy, one after the other.
sh_order = sh_order.sort_values(['security_code', 'order_time'])
sz_order = sz_order.sort_values(['security_code', 'order_time'])

In [8]:
# Display the first 25 Shanghai order rows after sorting.
sh_order.head(25)

Unnamed: 0,security_code,trading_day,exchange_code,order_time,appl_seq_num,order_side,order_type,order_details,order_no,order_price,order_price_adj,order_volume,op_type
40306856,600000,2016-08-31,1,91500040,78216.0,-1,A,L,364,16.49,16.49,6700,0
40306857,600000,2016-08-31,1,91500070,78217.0,-1,A,L,407,16.45,16.45,28000,0
40306858,600000,2016-08-31,1,91500170,78218.0,-1,A,L,562,16.49,16.49,1600,0
40306859,600000,2016-08-31,1,91500220,78219.0,1,A,L,645,15.98,15.98,2000,0
40306860,600000,2016-08-31,1,91500290,78220.0,-1,A,L,732,16.49,16.49,4000,0
40306861,600000,2016-08-31,1,91500290,78221.0,-1,A,L,733,16.49,16.49,1500,0
40306862,600000,2016-08-31,1,91500290,78222.0,-1,A,L,734,16.49,16.49,900,0
40306863,600000,2016-08-31,1,91500290,78223.0,-1,A,L,735,16.49,16.49,1500,0
40306864,600000,2016-08-31,1,91500290,78224.0,-1,A,L,736,16.49,16.49,800,0
40306865,600000,2016-08-31,1,91500290,78225.0,-1,A,L,737,16.49,16.49,2300,0


In [9]:
# Display the first 25 Shenzhen order rows after sorting.
sz_order.head(25)

Unnamed: 0,security_code,trading_day,exchange_code,order_time,appl_seq_num,order_side,order_type,order_details,order_no,order_price,order_price_adj,order_volume,op_type
6700921,1,2016-08-31,2,91500000,220.0,-1,A,L,220,9.51,9.51,999600,0
6700922,1,2016-08-31,2,91500040,526.0,-1,A,L,526,10.1,10.1,500,0
6700923,1,2016-08-31,2,91500050,559.0,1,A,L,559,9.1,9.1,800,0
6700924,1,2016-08-31,2,91500080,782.0,1,A,L,782,9.32,9.32,400,0
6700925,1,2016-08-31,2,91500130,1077.0,1,A,L,1077,9.21,9.21,500,0
6700926,1,2016-08-31,2,91500140,1084.0,-1,A,L,1084,9.7,9.7,2500,0
6700927,1,2016-08-31,2,91500170,1175.0,-1,A,L,1175,10.2,10.2,600,0
6700928,1,2016-08-31,2,91500170,1178.0,1,A,L,1178,9.0,9.0,1000,0
6700929,1,2016-08-31,2,91500180,1198.0,-1,A,L,1198,9.8,9.8,2500,0
6700930,1,2016-08-31,2,91500200,1467.0,-1,A,L,1467,10.2,10.2,1000,0
