In [None]:
### 从2024年股票数据中获取所有股票的代码

import os
import json

# Define the updated function to modify the filenames as required
def get_ticker_with_prefix(path):
    """Build sorted, exchange-prefixed tickers from the CSV file names in ``path``.

    A file named like ``000001.SZ.csv`` becomes ``sz000001``: the second
    dot-separated segment is lower-cased and placed in front of every digit
    found in the file name.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        Sorted list of ticker strings; empty when no file qualifies.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    tickers = []
    for file_name in os.listdir(path):
        if not file_name.endswith('.csv'):
            continue
        segments = file_name.split('.')
        # ``<code>.<exchange>.csv`` has at least three segments. With fewer, the
        # second segment is the ``csv`` extension itself and would produce a
        # bogus ticker such as ``csv000001``, so such files are skipped.
        if len(segments) < 3:
            continue
        exchange = segments[1].lower()
        digits = ''.join(ch for ch in file_name if ch.isdigit())
        tickers.append(exchange + digits)
    # Sort so the output is stable regardless of directory listing order
    return sorted(tickers)

# Input folder of per-stock CSV files and the JSON file to produce;
# adjust both paths to match your local layout
path_to_folder = 'data/2024'
path_to_json_file = 'data/ticker.json'

# Collect the exchange-prefixed tickers and serialise them as an indented JSON array
tickers_with_prefix = get_ticker_with_prefix(path_to_folder)
json_content = json.dumps(tickers_with_prefix, indent=4)

# Persist the JSON text (overwrites any existing file)
with open(path_to_json_file, 'w') as out_file:
    out_file.write(json_content)

In [None]:
### 从 akshare 获取后复权因子数据，并保存在 CSV 文件中

import akshare as ak
import json
import os

# Load the list of ticker symbols
path_to_json_file = 'data/ticker.json'
with open(path_to_json_file, 'r') as json_file:
    tickers = json.load(json_file)

# Make sure the output directory exists before any download. Without it every
# `to_csv` call fails after the API request has already been made, and the
# broad `except` below would hide that as a per-ticker error.
factor_dir = "data/backward_adjust_factor"
os.makedirs(factor_dir, exist_ok=True)

# Fetch the backward-adjustment (hfq) factors for each ticker not yet on disk
for ticker in tickers:
    filename = f"{ticker}.csv"
    file_path = os.path.join(factor_dir, filename)

    # Already downloaded on a previous run: nothing to do
    if os.path.exists(file_path):
        print(f"File {filename} already exists. Skipping.")
        continue

    try:
        # Request the factor table from the API and store it as CSV
        df = ak.stock_zh_a_daily(symbol=ticker, adjust="hfq-factor")
        df.to_csv(file_path, index=False)
        print(f"{ticker} data saved to {filename}")
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next ticker
        print(f"Error retrieving data for {ticker}: {e}")

In [None]:
### 处理股票tickers，去掉数字后面的.和字母

import json
import re

# Load the ticker list; the file is assumed to be 'data/top_market_cap_stocks.json'
with open('data/top_market_cap_stocks.json', 'r') as src:
    data = json.load(src)

# For every string, drop the dot and any letters that directly follow a digit
# (e.g. "000001.SZ" -> "000001"), then sort the cleaned values
suffix_pattern = re.compile(r'(\d)\.[a-zA-Z]*')
data = sorted(suffix_pattern.sub(r'\1', entry) for entry in data)

# Write the cleaned, sorted list back over the same JSON file
with open('data/top_market_cap_stocks.json', 'w') as dst:
    json.dump(data, dst, ensure_ascii=False, indent=4)

In [None]:
### 使用价格数据后复权后计算股票收益率
### 已弃用

import pandas as pd
from tqdm import tqdm

# Path to the stock price CSV; it must have a `trade_time` column, and every
# column after the first is treated as one stock code
stock_prices_path = 'data/merged_data.csv'
# Folder holding the per-stock backward-adjustment (hfq) factor CSV files
adjust_factor_path = 'data/backward_adjust_factor/'

# Number of rows read per chunk
chunksize = 10**5

# Count data rows (header excluded) to size the progress bar; the context
# manager closes the file handle once counting is done
with open(stock_prices_path, 'r') as price_file:
    total_rows = max(sum(1 for _ in price_file) - 1, 0)
total_chunks = total_rows // chunksize + (total_rows % chunksize > 0)


def _load_hfq_factors(stock_code):
    """Return the hfq factors of ``stock_code`` as a date-indexed Series, or None.

    Looks for ``sh<code>.csv`` first and ``sz<code>.csv`` second inside
    ``adjust_factor_path``; returns None when neither file exists.
    """
    for exchange in ('sh', 'sz'):
        try:
            factors = pd.read_csv(f'{adjust_factor_path}{exchange}{stock_code}.csv')
        except FileNotFoundError:
            continue
        factors['date'] = pd.to_datetime(factors['date'])
        factors = factors.sort_values('date').set_index('date')
        # A forward-fill reindex needs unique labels: keep the last row per date
        factors = factors[~factors.index.duplicated(keep='last')]
        return factors['hfq_factor']
    return None


# Each factor file is read at most once, not once per chunk
factor_cache = {}
# One adjusted frame per chunk. Chunks carry a continuing row index, so writing
# them into a frame built from the first chunk would align to nothing and
# leave only NaN; they are collected here and concatenated afterwards.
adjusted_frames = []

# Show progress with tqdm
with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
    for chunk in pd.read_csv(stock_prices_path, chunksize=chunksize):
        chunk['trade_time'] = pd.to_datetime(chunk['trade_time'])
        adjusted_columns = {'trade_time': chunk['trade_time']}

        # Handle every stock code column
        for stock_code in chunk.columns[1:]:
            if stock_code not in factor_cache:
                factor_cache[stock_code] = _load_hfq_factors(stock_code)
                if factor_cache[stock_code] is None:
                    # No factor file under either prefix: report once and skip the stock
                    print(f"未找到股票 {stock_code} 的后复权因子文件")
            hfq_factors = factor_cache[stock_code]
            if hfq_factors is None:
                continue

            # Latest factor dated at or before each trade time. Timestamps
            # earlier than the first factor date get NaN instead of raising.
            factor_at_time = hfq_factors.reindex(chunk['trade_time'], method='ffill')
            # Backward-adjusted price = raw price * factor (positional match via to_numpy)
            adjusted_columns[stock_code] = chunk[stock_code] * factor_at_time.to_numpy()

        adjusted_frames.append(pd.DataFrame(adjusted_columns))
        pbar.update(1)  # advance the progress bar

# Stitch the chunks together in file order
if adjusted_frames:
    adjusted_prices = pd.concat(adjusted_frames, ignore_index=True)
else:
    adjusted_prices = pd.DataFrame(columns=['trade_time'])

# Use the trade time as the index
adjusted_prices = adjusted_prices.set_index('trade_time')

# Save the adjusted prices
adjusted_prices.to_csv('data/stock_prices_hfq.csv', index=True)
print('data/stock_prices_hfq.csv saved')

# Compute period-over-period returns
returns = adjusted_prices.pct_change()
print('returns calculated')

# Drop the first row: it has no previous observation, so its return is NaN
returns = returns.iloc[1:]

# Save the returns as CSV
returns.to_csv('data/stock_returns.csv', index=True)
print('data/stock_returns.csv saved')

# Save the returns as a pickle file
returns.to_pickle('data/stock_returns.pkl')
print('data/stock_returns.pkl saved')
