### 计算每个股票的30min总成交额

In [1]:
import pandas as pd
import os
import threading
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor,as_completed

def process_stock_data(file_name, input_folder, output_folder, batch_size=30):
    """Aggregate one stock's rows into fixed-size batches and write the totals.

    Reads ``input_folder/file_name`` as CSV, splits its rows by position into
    consecutive batches of ``batch_size`` rows (the final batch may be
    shorter) and writes one row per batch to ``output_folder/file_name``:

    * ``trade_time``   -- the ``trade_time`` value of the batch's last row
    * ``total_amount`` -- the sum of the batch's ``amount`` column

    If the output file already exists the function returns immediately, so a
    run can be resumed. The result is written to a temporary file first and
    then moved into place, so an interrupted run never leaves a truncated
    output file that a later run would skip.

    Args:
        file_name: Name of the CSV file; used for both the input and the
            output file.
        input_folder: Folder that contains the input file.
        output_folder: Folder that receives the aggregated file. It must
            already exist.
        batch_size: Number of consecutive rows per batch (default 30).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Skip files that were already processed
    output_file_path = os.path.join(output_folder, file_name)
    if os.path.exists(output_file_path):
        return

    # Load the input data
    df = pd.read_csv(os.path.join(input_folder, file_name))
    row_count = len(df)

    # Position of the last row of every batch (the tail batch may be short)
    last_rows = [min(start + batch_size, row_count) - 1
                 for start in range(0, row_count, batch_size)]

    # Batch number of every row, indexed like df so the groupby aligns 1:1
    batch_ids = pd.Series(range(row_count), index=df.index) // batch_size
    totals = df['amount'].groupby(batch_ids).sum()

    # Collect the per-batch results in a new DataFrame
    processed_df = pd.DataFrame({
        'trade_time': df['trade_time'].iloc[last_rows].to_numpy(),
        'total_amount': totals.to_numpy(),
    })

    # Write to a temporary name and rename, so the final path only ever
    # holds a complete file
    temp_path = output_file_path + '.tmp'
    try:
        processed_df.to_csv(temp_path, index=False)
        os.replace(temp_path, output_file_path)
    finally:
        # Only present here if writing or renaming failed
        if os.path.exists(temp_path):
            os.remove(temp_path)

def main(input_folder, output_folder):
    """Run ``process_stock_data`` for every entry of ``input_folder``.

    Each directory entry is submitted to a thread pool and a tqdm progress
    bar advances as tasks complete. An exception raised by any task is
    re-raised here through ``future.result()``.

    Args:
        input_folder: Folder whose entries are processed.
        output_folder: Folder that receives the results; created if missing.
    """
    files = os.listdir(input_folder)

    # Make sure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Leave five cores free, but never drop below one worker:
    # os.cpu_count() may return None, and ThreadPoolExecutor rejects
    # max_workers <= 0.
    worker_count = max(1, (os.cpu_count() or 1) - 5)

    # Run the tasks on a thread pool
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [executor.submit(process_stock_data, file, input_folder, output_folder) for file in files]

        # Show progress with tqdm
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

if __name__ == "__main__":
    # Change both paths to your own input and output folder locations.
    input_folder, output_folder = 'data/by_stock_merged', 'data/v30/by_stock_v30'
    main(input_folder, output_folder)


 28%|██▊       | 1427/5100 [1:58:24<7:40:53,  7.53s/it] 

### 计算交易额强度

In [2]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def process_file(file_path, output_dir=None):
    """Add ``v_avg30`` and ``v_str30`` columns to one CSV and save the result.

    The input CSV must contain ``trade_time`` and ``total_amount`` columns.

    * ``v_avg30`` -- for each row, the mean ``total_amount`` of up to 10
      preceding rows that share the same time of day (the current row is
      excluded). With one row per time slot per day this is the average over
      the previous 10 days.
    * ``v_str30`` -- ``total_amount / v_avg30``.

    Both columns are set to 0 for every row of the earliest date in the file.

    Args:
        file_path: Path of the CSV file to process.
        output_dir: Folder that receives the result, saved under the same
            base name as ``file_path``. When omitted, the module-level
            ``output_folder`` variable is used.
    """
    if output_dir is None:
        # Backward-compatible fallback for callers that pass only the path
        output_dir = output_folder

    # Read the file
    df = pd.read_csv(file_path)

    # Parse the timestamps once, then split into date and time of day
    trade_times = pd.to_datetime(df['trade_time'])
    df['date'] = trade_times.dt.date
    df['time'] = trade_times.dt.time

    # Rolling mean over the previous rows of the same time slot; shift()
    # excludes the current row. transform() returns values aligned to df's
    # own index, so no index manipulation is needed afterwards.
    df['v_avg30'] = df.groupby('time')['total_amount'].transform(
        lambda amounts: amounts.shift().rolling(window=10, min_periods=1).mean())

    # The first trading day has no history: set it to 0
    first_day = df['date'].min()
    is_first_day = df['date'] == first_day
    df.loc[is_first_day, 'v_avg30'] = 0

    # Add the v_str30 column
    # NOTE(review): later rows whose v_avg30 is 0 or NaN produce inf/NaN
    # here -- confirm that is acceptable downstream.
    df['v_str30'] = df['total_amount'] / df['v_avg30']
    df.loc[is_first_day, 'v_str30'] = 0

    # Drop the helper columns
    df.drop(columns=['date', 'time'], inplace=True)

    # Save the result
    output_file_path = os.path.join(output_dir, os.path.basename(file_path))
    df.to_csv(output_file_path, index=False)

def process_files_in_folder(folder_path, output_folder):
    """Run ``process_file`` on every ``.csv`` file directly inside a folder.

    Files are processed on a thread pool with a tqdm progress bar. An
    exception raised by ``process_file`` propagates to the caller.

    Args:
        folder_path: Folder whose ``.csv`` files are processed.
        output_folder: Not used by this function; ``process_file`` is called
            with the file path only. Kept so existing callers keep working.
    """
    files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Use (CPU cores - 1) threads, at least one. os.cpu_count() may return
    # None when the core count cannot be determined.
    num_threads = max(1, (os.cpu_count() or 1) - 1)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Drain the iterator so every task runs and errors surface here
        for _ in tqdm(executor.map(process_file, files), total=len(files)):
            pass

# Folder holding the CSV files to process (replace with your data folder path)
folder_path = 'data/v30/by_stock_v30'
# Folder that receives the results (replace with your output folder path)
output_folder = 'data/v30/by_stock_v_str30'

# Make sure the output folder exists; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Process every file in the folder
process_files_in_folder(folder_path, output_folder)



100%|██████████| 5100/5100 [10:01<00:00,  8.49it/s]


### 合并数据

In [5]:
import pandas as pd
import os
from tqdm import tqdm  # 导入tqdm

def merge_column(folder, column_name, output_file):
    """Combine one column from every CSV in ``folder`` into a single wide CSV.

    Each ``.csv`` file in ``folder`` contributes one output column, named
    after the file (without extension) and holding that file's
    ``column_name`` values. Rows are matched on ``trade_time`` with an outer
    join and written sorted by ``trade_time``; a ``trade_time`` missing from
    a file leaves that file's cell empty.

    Args:
        folder: Folder containing the per-ticker CSV files. Each file must
            have ``trade_time`` and ``column_name`` columns.
        column_name: Name of the column to extract from every file.
        output_file: Path of the merged CSV to write.

    Raises:
        ValueError: If ``folder`` contains no ``.csv`` file.
    """
    columns = []

    # Sorted so the output column order does not depend on the filesystem
    for file in tqdm(sorted(os.listdir(folder))):  # tqdm shows read progress
        if not file.endswith('.csv'):
            continue
        ticker = os.path.splitext(file)[0]

        # Read only trade_time and the requested column
        df = pd.read_csv(os.path.join(folder, file),
                         usecols=['trade_time', column_name])
        columns.append(df.set_index('trade_time')[column_name].rename(ticker))

    if not columns:
        raise ValueError(f"no .csv files found in {folder!r}")

    if all(column.index.is_unique for column in columns):
        # One outer join over all files instead of thousands of pairwise
        # merges that each re-copy the growing result
        merged_df = pd.concat(columns, axis=1).sort_index()
        merged_df = merged_df.rename_axis('trade_time').reset_index()
    else:
        # Column-wise concat needs unique keys; with duplicated trade_time
        # values fall back to merging pair by pair
        merged_df = None
        for column in columns:
            frame = column.reset_index()
            if merged_df is None:
                merged_df = frame
            else:
                merged_df = pd.merge_ordered(merged_df, frame, on='trade_time')

    merged_df.to_csv(output_file, index=False)

# Folder holding the already processed per-stock files
output_folder = 'data/v30/by_stock_v_str30'

# Merge total_amount, v_avg30 and v_str30 into one file each
for _column, _target in (
    ('total_amount', 'data/v30/merged_amount.csv'),
    ('v_avg30', 'data/v30/merged_avg.csv'),
    ('v_str30', 'data/v30/merged_str.csv'),
):
    merge_column(output_folder, _column, _target)

print("所有数据合并完成")


100%|██████████| 5100/5100 [23:45<00:00,  3.58it/s]
 97%|█████████▋| 4933/5100 [22:29<01:19,  2.09it/s]

### 转pkl（将csv文件转换为pickle文件）

In [None]:
import pandas as pd
import os
# Convert every .csv file directly inside folder_path to a .pkl file with
# the same base name, written next to the source file.
folder_path = 'data/v30'
csv_names = [name for name in os.listdir(folder_path) if name.endswith('.csv')]
for name in csv_names:
    stem = os.path.splitext(name)[0]
    frame = pd.read_csv(os.path.join(folder_path, name))
    # No explicit compression argument is passed to to_pickle
    frame.to_pickle(os.path.join(folder_path, stem + '.pkl'))

print("所有文件已转换完成。")