In [7]:
### 从2024年股票数据中获取所有股票的代码

import os
import json

# Define the updated function to modify the filenames as required
def get_ticker_with_prefix(path):
    """Build exchange-prefixed tickers from the CSV file names in a folder.

    File names are expected to look like ``000001.SZ.csv``: a numeric stock
    code, an exchange segment, and the ``.csv`` extension. Each such file
    yields the lower-cased exchange segment followed by the digits of the
    code segment, e.g. ``sz000001``.

    Files that do not end in ``.csv``, that have no exchange segment
    (e.g. ``000001.csv``), or whose code segment contains no digits are
    skipped, so the extension is never mistaken for an exchange.

    Args:
        path: Folder to scan (not recursive).

    Returns:
        Sorted list of ticker strings.
    """
    tickers = []
    for f in os.listdir(path):
        if not f.endswith('.csv'):
            continue
        parts = f.split('.')
        # '<code>.<exchange>.csv' splits into at least three segments; with
        # fewer, parts[1] would be the 'csv' extension itself.
        if len(parts) < 3:
            continue
        # Only the code segment contributes digits.
        numeric_part = ''.join(filter(str.isdigit, parts[0]))
        if not numeric_part:
            continue
        letter_part = parts[1].lower()
        tickers.append(letter_part + numeric_part)
    # Sort the list of IDs
    return sorted(tickers)

# Input folder with the per-stock CSV files and the JSON file that will hold
# the resulting ticker list; adjust both to your layout.
path_to_folder = 'data/2024'
path_to_json_file = 'data/ticker.json'

# Collect the prefixed tickers and serialise them as an indented JSON array.
tickers_with_prefix = get_ticker_with_prefix(path_to_folder)
json_content = json.dumps(tickers_with_prefix, indent=4)

# Persist the JSON text.
with open(path_to_json_file, 'w') as json_file:
    json_file.write(json_content)

In [8]:
### 从 akshare 获取后复权因子数据，并保存在 CSV 文件中

import akshare as ak
import json
import os

# Load the ticker list.
path_to_json_file = 'data/ticker.json'
with open(path_to_json_file, 'r') as json_file:
    tickers = json.load(json_file)

# Create the output folder up front. Without it every `to_csv` below fails,
# and the failure only shows up after the API call has already been made.
factor_dir = "data/backward_adjust_factor/"
os.makedirs(factor_dir, exist_ok=True)

# Fetch the factor table for every ticker that has no file on disk yet.
for ticker in tickers:
    filename = f"{ticker}.csv"
    target_path = factor_dir + filename

    # Files already on disk are not fetched again, so the cell can be re-run
    # to pick up only the tickers that are still missing.
    if os.path.exists(target_path):
        print(f"File {filename} already exists. Skipping.")
        continue

    try:
        # Query the API and store the result as CSV.
        df = ak.stock_zh_a_daily(symbol=ticker, adjust="hfq-factor")
        df.to_csv(target_path, index=False)
        print(f"{ticker} data saved to {filename}")
    except Exception as e:
        # Best effort: report the failure and move on to the next ticker.
        print(f"Error retrieving data for {ticker}: {e}")

ModuleNotFoundError: No module named 'akshare'

In [None]:
### 处理股票tickers，去掉数字后面的.和字母

import json
import re

# Load the list of stock codes; the same file is overwritten at the end.
with open('data/top_market_cap_stocks.json', 'r') as f:
    data = json.load(f)

# A digit followed by '.' and any run of letters: keep the digit, drop the rest.
suffix_pattern = re.compile(r'(\d)\.[a-zA-Z]*')
data = sorted(suffix_pattern.sub(r'\1', code) for code in data)

# Write the cleaned, sorted list back to the same JSON file.
with open('data/top_market_cap_stocks.json', 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# ### 使用价格数据后复权后计算股票收益率
# ### 已弃用

# import pandas as pd
# from tqdm import tqdm

# # 假设股票价格数据的文件路径
# stock_prices_path = 'data/merged_data.csv'
# # 假设后复权因子文件夹的路径
# adjust_factor_path = 'data/backward_adjust_factor/'

# # 分块大小
# chunksize = 10**5  

# # 读取股票价格数据的行数来计算总块数
# total_rows = sum(1 for row in open(stock_prices_path, 'r'))
# total_chunks = total_rows // chunksize + (total_rows % chunksize > 0)

# # 初始化存储后复权价格的DataFrame
# adjusted_prices = None

# # 使用tqdm显示进度条
# with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
#     for chunk in pd.read_csv(stock_prices_path, chunksize=chunksize):
#         chunk['trade_time'] = pd.to_datetime(chunk['trade_time'])
#         if adjusted_prices is None:
#             adjusted_prices = pd.DataFrame()
#             adjusted_prices['trade_time'] = chunk['trade_time']

#         # 对于每个股票代码
#         for stock_code in chunk.columns[1:]:
#             try:
#                 # 尝试加载后复权因子文件
#                 hfq_factor_file = f'{adjust_factor_path}sh{stock_code}.csv'
#                 hfq_factors = pd.read_csv(hfq_factor_file)
#             except FileNotFoundError:
#                 try:
#                     hfq_factor_file = f'{adjust_factor_path}sz{stock_code}.csv'
#                     hfq_factors = pd.read_csv(hfq_factor_file)
#                 except FileNotFoundError:
#                     # 如果都找不到，打印信息并跳过该股票
#                     print(f"未找到股票 {stock_code} 的后复权因子文件")
#                     continue

#             hfq_factors['date'] = pd.to_datetime(hfq_factors['date'])
#             hfq_factors.sort_values('date', inplace=True)
#             hfq_factors.set_index('date', inplace=True)

#             # 计算后复权价格
#             adjusted_chunk = chunk.apply(
#                 lambda row: row[stock_code] * hfq_factors.loc[:row['trade_time']].iloc[-1]['hfq_factor'],
#                 axis=1
#             )
#             adjusted_prices[stock_code] = adjusted_chunk

#         pbar.update(1)  # 更新进度条

# # 设置交易时间为索引
# adjusted_prices.set_index('trade_time', inplace=True)

# # 保存结果
# adjusted_prices.to_csv('data/stock_prices_hfq.csv', index=True)
# print('data/stock_prices_hfq.csv saved')

# # 计算收益率
# returns = adjusted_prices.pct_change()
# print('returns calculated')

# # 移除收益率DataFrame中的第一行，因为它将会是NaN（第一个点没有前一个点来计算收益率）
# returns = returns.iloc[1:]

# # 将收益率DataFrame保存为CSV文件
# returns.to_csv('data/stock_returns.csv', index=True)
# print('data/stock_returns.csv saved')

# # 将收益率DataFrame保存为序列化的二进制文件（Pickle格式）
# returns.to_pickle('data/stock_returns.pkl')
# print('data/stock_returns.pkl saved')


In [None]:
### 处理收益率 - 测试

import pandas as pd
import os
import datetime

# Input locations: the returns table and the folder of per-ticker factor files.
# (Point this at 'data/stock_returns_test.csv' for a trial run on test data.)
path_to_stock_returns = 'data/stock_returns.csv'
path_to_hfq_data_folder = 'data/backward_adjust_factor/'

# Load the returns and index them by timestamp.
stock_returns = pd.read_csv(path_to_stock_returns)
stock_returns['trade_time'] = pd.to_datetime(stock_returns['trade_time'])
stock_returns = stock_returns.set_index('trade_time')

# Each factor row is applied to the 09:31:00 entry of its date.
first_bar_offset = datetime.timedelta(hours=9, minutes=31)

# Every column of the returns table is treated as a ticker.
for ticker in stock_returns.columns:
    # Factor files are named sh<ticker>.csv or sz<ticker>.csv; use the first that exists.
    for prefix in ['sh', 'sz']:
        file_path = os.path.join(path_to_hfq_data_folder, f'{prefix}{ticker}.csv')
        if os.path.exists(file_path):
            break
    else:
        print(f'No hfq_one_point data file found for ticker {ticker}. Skipping...')
        continue

    hfq_data = pd.read_csv(file_path)
    hfq_data['date'] = pd.to_datetime(hfq_data['date'])

    # Only factor rows dated after 2000-01-01 are considered.
    recent_rows = hfq_data[hfq_data['date'] > '2000-01-01']
    for _, factor_row in recent_rows.iterrows():
        trade_time = factor_row['date'] + first_bar_offset
        if trade_time not in stock_returns.index:
            continue
        # Scale the gross return (1 + r) by the row's hfq_one_point value.
        raw_return = stock_returns.at[trade_time, ticker]
        stock_returns.at[trade_time, ticker] = (raw_return + 1) * factor_row['hfq_one_point'] - 1

# Save the adjusted table (use 'data/adjusted_return_test.csv' for a trial run).
stock_returns.to_csv('data/adjusted_return.csv')

In [None]:
### 处理收益率

import pandas as pd
import os
import datetime
from tqdm import tqdm  # 引入tqdm库

# Input locations: the returns table and the folder of per-ticker factor files.
path_to_stock_returns = 'data/stock_returns.csv'
path_to_hfq_data_folder = 'data/backward_adjust_factor/'

# Load the returns and index them by timestamp.
stock_returns = pd.read_csv(path_to_stock_returns)
stock_returns['trade_time'] = pd.to_datetime(stock_returns['trade_time'])
stock_returns = stock_returns.set_index('trade_time')

# Each factor row is applied to the 09:31:00 entry of its date.
first_bar_offset = datetime.timedelta(hours=9, minutes=31)

# Every column is treated as a ticker; tqdm renders a progress bar.
for ticker in tqdm(stock_returns.columns, desc="Processing tickers"):
    # Factor files are named sh<ticker>.csv or sz<ticker>.csv; use the first that exists.
    for prefix in ['sh', 'sz']:
        file_path = os.path.join(path_to_hfq_data_folder, f'{prefix}{ticker}.csv')
        if os.path.exists(file_path):
            break
    else:
        print(f'No hfq_one_point data file found for ticker {ticker}. Skipping...')
        continue

    hfq_data = pd.read_csv(file_path)
    hfq_data['date'] = pd.to_datetime(hfq_data['date'])

    # Only factor rows dated after 2000-01-01 are considered.
    recent_rows = hfq_data[hfq_data['date'] > '2000-01-01']
    for _, factor_row in recent_rows.iterrows():
        trade_time = factor_row['date'] + first_bar_offset
        if trade_time not in stock_returns.index:
            continue
        # Scale the gross return (1 + r) by the row's hfq_one_point value.
        raw_return = stock_returns.at[trade_time, ticker]
        stock_returns.at[trade_time, ticker] = (raw_return + 1) * factor_row['hfq_one_point'] - 1

# Save the adjusted table as CSV and as a pickle.
stock_returns.to_csv('data/adjusted_return.csv')
stock_returns.to_pickle('data/stock_returns.pkl')


Processing tickers:  13%|█▎        | 90/696 [00:02<00:11, 51.94it/s]

No hfq_one_point data file found for ticker 001696. Skipping...
No hfq_one_point data file found for ticker 001872. Skipping...


Processing tickers:  24%|██▎       | 164/696 [00:03<00:12, 43.10it/s]

No hfq_one_point data file found for ticker 001914. Skipping...


Processing tickers: 100%|██████████| 696/696 [00:05<00:00, 117.20it/s]


No hfq_one_point data file found for ticker 001896. Skipping...


In [None]:
### 合并股票日线数据

import os
import pandas as pd
import json

def merge_stock_data(source_directory, output_file_path, top_stocks_file):
    """Merge the daily closing prices of selected stocks into one wide CSV.

    Each ``*.csv`` file in ``source_directory`` (GBK-encoded, with a
    ``交易时间`` date column in ``YYYYMMDD`` form and a ``收盘价`` close-price
    column) is loaded if the digits of its leading name segment appear in the
    ticker list. Rows dated before 1999-01-01 are dropped. The per-stock
    frames are outer-joined on the date with forward filling.

    The output has a ``trade_date`` column (written as ``YYYY-MM-DD``)
    followed by one column per stock, named after the leading file-name
    segment (``000001`` for ``000001.SZ.csv``). ``trade_date`` is the column
    name the later notebook cells use when they read this file.

    Args:
        source_directory: Folder holding the per-stock daily CSV files.
        output_file_path: Destination CSV; its parent folder is created if
            it does not exist.
        top_stocks_file: UTF-8 JSON file containing the list of numeric
            stock codes to include.

    Returns:
        None. Prints a message and writes nothing when no file matches.
    """
    with open(top_stocks_file, 'r', encoding='utf-8') as tickers_fh:
        # A set gives constant-time membership tests in the loop below.
        tickers = set(json.load(tickers_fh))

    merged_data = None
    cutoff_date = pd.Timestamp('1999-01-01')

    # Sorted so the column order of the output does not depend on the
    # arbitrary order returned by os.listdir.
    for csv_name in sorted(os.listdir(source_directory)):
        if not csv_name.endswith(".csv"):
            continue
        stock_code = csv_name.split('.')[0]  # file names look like '000001.SZ.csv'
        stock_code_numeric = ''.join(filter(str.isdigit, stock_code))
        if stock_code_numeric not in tickers:
            continue

        df = pd.read_csv(
            os.path.join(source_directory, csv_name),
            encoding='gbk',
            usecols=['交易时间', '收盘价'],
            dtype={'交易时间': str},
        )
        # Parse explicitly rather than through read_csv's deprecated
        # `date_parser` argument.
        df['交易时间'] = pd.to_datetime(df['交易时间'], format='%Y%m%d')
        df = df[df['交易时间'] >= cutoff_date]
        df = df[['交易时间', '收盘价']].rename(
            columns={'交易时间': 'trade_date', '收盘价': stock_code}
        )

        if merged_data is None:
            merged_data = df
        else:
            merged_data = pd.merge_ordered(
                merged_data, df, on='trade_date', how='outer', fill_method='ffill'
            )

    if merged_data is not None:
        # os.makedirs('') raises, so only create a folder when the path has one.
        output_dir = os.path.dirname(output_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        merged_data.to_csv(output_file_path, index=False, date_format='%Y-%m-%d')
    else:
        print("没有可用的数据进行合并。")

# Example run: merge the daily files of the stocks listed in the JSON file.
source_directory = 'data/stocks_daily_data/'
top_stocks_file = 'data/top_market_cap_stocks.json'
output_file_path = 'data/merged_daily_stock_data.csv'
merge_stock_data(
    source_directory=source_directory,
    output_file_path=output_file_path,
    top_stocks_file=top_stocks_file,
)


In [2]:
### 将日线收盘价转为收益率

import pandas as pd

stock_prices_path = 'data/merged_daily_stock_data.csv'

# Load the merged closing prices, keyed by trading date.
stock_data = pd.read_csv(stock_prices_path).set_index('trade_date')

# Row-over-row percentage change; the first row has no predecessor, so drop it.
stock_returns = stock_data.pct_change().iloc[1:]
stock_returns.to_csv('data/stock_returns_daily.csv', index=True)

In [3]:
### 使用后复权因子调整股票日线的收益率

import pandas as pd
import os
import datetime

# Input locations: the daily returns table and the folder of per-ticker factor files.
path_to_stock_returns = 'data/stock_returns_daily.csv'
path_to_hfq_data_folder = 'data/backward_adjust_factor/'

# Load the returns and index them by trading date.
stock_returns = pd.read_csv(path_to_stock_returns)
stock_returns['trade_date'] = pd.to_datetime(stock_returns['trade_date'])
stock_returns = stock_returns.set_index('trade_date')

# Every column of the returns table is treated as a ticker.
for ticker in stock_returns.columns:
    # Factor files are named sh<ticker>.csv or sz<ticker>.csv; use the first that exists.
    for prefix in ['sh', 'sz']:
        file_path = os.path.join(path_to_hfq_data_folder, f'{prefix}{ticker}.csv')
        if os.path.exists(file_path):
            break
    else:
        print(f'No hfq_one_point data file found for ticker {ticker}. Skipping...')
        continue

    hfq_data = pd.read_csv(file_path)
    hfq_data['date'] = pd.to_datetime(hfq_data['date'])

    # Only factor rows dated after 1999-01-01 are considered.
    recent_rows = hfq_data[hfq_data['date'] > '1999-01-01']
    for _, factor_row in recent_rows.iterrows():
        trade_time = factor_row['date']
        if trade_time not in stock_returns.index:
            continue
        # Scale the gross return (1 + r) by the row's hfq_one_point value.
        raw_return = stock_returns.at[trade_time, ticker]
        stock_returns.at[trade_time, ticker] = (raw_return + 1) * factor_row['hfq_one_point'] - 1


# Save the adjusted table as CSV and as a pickle.
stock_returns.to_csv('data/adjusted_returns_daily.csv')
stock_returns.to_pickle('data/stock_returns_daily.pkl')


No hfq_one_point data file found for ticker 001696. Skipping...
No hfq_one_point data file found for ticker 001872. Skipping...
No hfq_one_point data file found for ticker 001896. Skipping...
No hfq_one_point data file found for ticker 001914. Skipping...


In [1]:
### 解释屹洲的数据问题
import pandas as pd

# Load only the timestamp column and the single ticker under investigation.
file_path = 'data/stock_returns.csv'  # path of the returns file
selected_columns = ['trade_time', '600733']  # columns to load

data = pd.read_csv(file_path, usecols=selected_columns, parse_dates=['trade_time'])
data = data.set_index('trade_time')

# All rows of 2023-11-30 for ticker 600733.
# (Append `.iloc[1:]` to leave out the first row of the day.)
sample_test = data['600733'].loc['2023-11-30']

# Summary statistics for that day.
for label, value in (
    ("mean", sample_test.mean()),
    ("std", sample_test.std()),
    ("len", len(sample_test)),
):
    print(f"{label} = {value}")

sample_test


mean = -1.7039907779586318e-05
std = 0.0020668156696454104
len = 241


trade_time
2023-11-30 09:31:00   -0.003072
2023-11-30 09:32:00   -0.003082
2023-11-30 09:33:00    0.000000
2023-11-30 09:34:00    0.006182
2023-11-30 09:35:00   -0.001536
                         ...   
2023-11-30 14:56:00   -0.001550
2023-11-30 14:57:00   -0.001553
2023-11-30 14:58:00    0.000000
2023-11-30 14:59:00    0.000000
2023-11-30 15:00:00    0.007776
Name: 600733, Length: 241, dtype: float64

In [18]:
### 计算数据中是0的比例

import pandas as pd

file_path = 'data/stock_returns_5m.csv'  # path of the returns file

# Load every column (pass `usecols=list(range(100))` to sample the first 100).
df = pd.read_csv(file_path)
df = df.set_index('trade_time')

# Overall share of cells that are exactly zero.
is_zero = df == 0
zeros = is_zero.sum().sum()
total_elements = df.size
ratio = zeros / total_elements

print(f"Total number of 0s: {zeros}")
print(f"Total number of elements: {total_elements}")
print(f"Ratio: {ratio}")

# Per-column share of zeros: zero count divided by the column length.
zero_ratios = [df[col].eq(0).sum() / len(df[col]) for col in df.columns]

print(zero_ratios)


Total number of 0s: 56650802
Total number of elements: 194601600
Ratio: 0.2911116969233552
[0.36293633762517885, 0.3264842632331903, 0.18104792560801145, 0.2736409155937053, 0.38564020028612306, 0.28023247496423465, 0.25912374821173106, 0.18853719599427754, 0.19012160228898425, 0.3148068669527897, 0.2882725321888412, 0.36507510729613735, 0.276777539341917, 0.2648140200286123, 0.2719599427753934, 0.2713090128755365, 0.3057010014306152, 0.2799892703862661, 0.2526144492131617, 0.39606938483547927, 0.370225321888412, 0.21243204577968527, 0.37894134477825464, 0.360379113018598, 0.3320422031473534, 0.2840236051502146, 0.2969098712446352, 0.2662696709585122, 0.22855507868383404, 0.20133404864091559, 0.2019134477825465, 0.4369170243204578, 0.3610550786838341, 0.4320994277539342, 0.2579685264663805, 0.3624928469241774, 0.22968884120171673, 0.18221745350500715, 0.3376001430615165, 0.23331545064377682, 0.19611230329041487, 0.27108726752503576, 0.2759942775393419, 0.305, 0.3087732474964235, 0.2774

In [14]:
### 生成5min数据

import pandas as pd
from datetime import datetime

# 读取CSV文件
def filter_csv(input_file, output_file):
    """Keep only the rows whose ``trade_time`` minute is a multiple of five.

    Reads ``input_file``, which must contain a ``trade_time`` column formatted
    as ``YYYY-MM-DD HH:MM:SS``, and writes the selected rows to
    ``output_file`` without the index. The ``trade_time`` text is written
    back unchanged.

    Any exception (missing file, missing column, malformed timestamp) is
    caught and reported with ``print``; in that case nothing is written.

    Args:
        input_file: Path of the source CSV.
        output_file: Path of the filtered CSV to create.
    """
    try:
        df = pd.read_csv(input_file)

        # Make sure 'trade_time' is text before parsing it.
        df['trade_time'] = df['trade_time'].astype(str)

        # Parse the whole column in one vectorised call instead of running
        # datetime.strptime once per row; the column itself is left as text.
        minutes = pd.to_datetime(df['trade_time'], format='%Y-%m-%d %H:%M:%S').dt.minute
        df_filtered = df[minutes % 5 == 0]

        # Save to the new CSV file.
        df_filtered.to_csv(output_file, index=False)
        print(f"Filtered data saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Example run: filter the merged price file down to the 5-minute rows.
filter_csv(
    input_file='data/merged_data.csv',
    output_file='data/merged_5m_stock_data.csv',
)


Filtered data saved to data/merged_5m__stock_data.csv


In [15]:
### 将5m线收盘价转为收益率

import pandas as pd

stock_prices_path = 'data/merged_5m_stock_data.csv'

# Load the 5-minute closing prices, keyed by timestamp.
stock_data = pd.read_csv(stock_prices_path).set_index('trade_time')

# Row-over-row percentage change; the first row has no predecessor, so drop it.
stock_returns = stock_data.pct_change().iloc[1:]
stock_returns.to_csv('data/stock_returns_5m.csv', index=True)

In [16]:
### 处理5m线收益率

import pandas as pd
import os
import datetime
from tqdm import tqdm  # 引入tqdm库

# Input locations: the 5-minute returns table and the folder of per-ticker factor files.
path_to_stock_returns = 'data/stock_returns_5m.csv'
path_to_hfq_data_folder = 'data/backward_adjust_factor/'

# Load the returns and index them by timestamp.
stock_returns = pd.read_csv(path_to_stock_returns)
stock_returns['trade_time'] = pd.to_datetime(stock_returns['trade_time'])
stock_returns = stock_returns.set_index('trade_time')

# Each factor row is applied to the 09:35:00 entry of its date.
first_bar_offset = datetime.timedelta(hours=9, minutes=35)

# Every column is treated as a ticker; tqdm renders a progress bar.
for ticker in tqdm(stock_returns.columns, desc="Processing tickers"):
    # Factor files are named sh<ticker>.csv or sz<ticker>.csv; use the first that exists.
    for prefix in ['sh', 'sz']:
        file_path = os.path.join(path_to_hfq_data_folder, f'{prefix}{ticker}.csv')
        if os.path.exists(file_path):
            break
    else:
        print(f'No hfq_one_point data file found for ticker {ticker}. Skipping...')
        continue

    hfq_data = pd.read_csv(file_path)
    hfq_data['date'] = pd.to_datetime(hfq_data['date'])

    # Only factor rows dated after 2000-01-01 are considered.
    recent_rows = hfq_data[hfq_data['date'] > '2000-01-01']
    for _, factor_row in recent_rows.iterrows():
        trade_time = factor_row['date'] + first_bar_offset
        if trade_time not in stock_returns.index:
            continue
        # Scale the gross return (1 + r) by the row's hfq_one_point value.
        raw_return = stock_returns.at[trade_time, ticker]
        stock_returns.at[trade_time, ticker] = (raw_return + 1) * factor_row['hfq_one_point'] - 1

# Save the adjusted table as CSV and as a pickle.
stock_returns.to_csv('data/adjusted_return_5m.csv')
stock_returns.to_pickle('data/stock_returns_5m.pkl')

Processing tickers:  24%|██▎       | 165/696 [00:00<00:00, 576.02it/s]

No hfq_one_point data file found for ticker 001696. Skipping...
No hfq_one_point data file found for ticker 001872. Skipping...
No hfq_one_point data file found for ticker 001914. Skipping...


Processing tickers: 100%|██████████| 696/696 [00:01<00:00, 672.02it/s]


No hfq_one_point data file found for ticker 001896. Skipping...
