### 获取所有股票代码

In [None]:
import os
import json

def get_ticker_with_prefix(path):
    """Return the sorted numeric ticker codes of the CSV files in *path*.

    Every directory entry whose name ends in ``.csv`` contributes one code,
    built by concatenating all digit characters found in the file name
    (``000001.SZ.csv`` yields ``"000001"``).

    Despite the function name, no exchange prefix is added: the returned
    codes consist of digits only.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of digit-only strings in ascending string order. Entries are
        not de-duplicated, and a CSV name without digits yields ``""``.

    Raises:
        OSError: If *path* cannot be listed.
    """
    return sorted(
        ''.join(filter(str.isdigit, name))
        for name in os.listdir(path)
        if name.endswith('.csv')
    )

# Input folder: one CSV per stock for the year being scanned.
path_to_folder = 'data/raw-data/2024'

# Output location of the JSON ticker list.
path_to_json_file = 'data/tickers.json'

# Collect the sorted ticker codes, then serialise them as an indented JSON array.
tickers_with_prefix = get_ticker_with_prefix(path_to_folder)
json_content = json.dumps(tickers_with_prefix, indent=4)

# Persist the JSON text, overwriting any previous file.
with open(path_to_json_file, 'w') as json_file:
    json_file.write(json_content)

### 按股票合并历年数据<多线程>

In [None]:
import os
import pandas as pd
import json
import concurrent.futures
import os

# Path configuration
json_file = 'data/top_cap_tickers.json'
data_dir = 'data/raw-data/'
merged_dir = 'data/by_stock_merged_top_caps/'
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(merged_dir, exist_ok=True)

# Load the list of ticker codes to merge
with open(json_file, 'r') as f:
    tickers = json.load(f)

def merge_data_for_ticker(ticker):
    """Concatenate every yearly CSV of *ticker* into one file in ``merged_dir``.

    Each sub-directory of ``data_dir`` is searched for ``<ticker>.SZ.csv``
    and then ``<ticker>.SH.csv``; the first one that exists is read. When at
    least one file was found, the combined rows are sorted by the
    ``trade_time`` column and written to ``<merged_dir>/<ticker>.csv``
    without the index. Nothing is written when no file was found.

    Args:
        ticker: Ticker code used to build the file names.

    Returns:
        None.
    """
    # Candidate file names, in the order they are tried inside each year folder.
    candidate_names = [f"{ticker}{suffix}.csv" for suffix in ('.SZ', '.SH')]

    frames = []
    for year in os.listdir(data_dir):
        year_dir = os.path.join(data_dir, year)
        if not os.path.isdir(year_dir):
            # Skip stray files that sit next to the year folders.
            continue
        for name in candidate_names:
            csv_path = os.path.join(year_dir, name)
            if os.path.isfile(csv_path):
                frames.append(pd.read_csv(csv_path))
                # Only the first matching suffix is used for a given year.
                break

    if not frames:
        return

    merged = pd.concat(frames).sort_values(by='trade_time')
    merged.to_csv(os.path.join(merged_dir, f"{ticker}.csv"), index=False)

def main():
    """Merge the yearly files of every ticker in ``tickers`` on a thread pool.

    One task per ticker is submitted to ``merge_data_for_ticker``. A task
    that raises does not stop the others; its ticker and exception are
    printed so that failures are visible instead of being silently dropped.
    """
    # os.cpu_count() may return None; fall back to 1. One core is left free.
    max_workers = max(1, (os.cpu_count() or 1) - 1)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(merge_data_for_ticker, ticker): ticker
            for ticker in tickers
        }
        # Inspect every future: an exception raised inside a worker is only
        # observable through the future object.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f"Failed to merge {futures[future]}: {error!r}")


if __name__ == "__main__":
    main()

### 单只股票数据检查<多线程>

In [None]:
import pandas as pd
from pathlib import Path
import concurrent.futures
import os

def check_data_points_and_nan(file_path, expected_points=240):
    """Check one CSV for days with an unexpected row count and for NaN rows.

    The file is read with pandas, ``trade_time`` is parsed to datetimes and
    the rows are counted per calendar date.

    Args:
        file_path: Path of the CSV file; it must contain a ``trade_time``
            column. The file stem is reported as the ticker.
        expected_points: Number of rows every date is expected to have.
            Defaults to 240 (presumably one-minute bars of a four-hour
            session -- confirm against the data source).

    Returns:
        ``None`` when every date has ``expected_points`` rows and no row
        contains a missing value. Otherwise a tuple
        ``(ticker, invalid_dates, nan_rows)`` where ``invalid_dates`` is the
        list of dates whose row count differs and ``nan_rows`` is the
        sub-frame of rows that contain at least one missing value.
    """
    df = pd.read_csv(file_path)
    df['trade_time'] = pd.to_datetime(df['trade_time'])
    df['date'] = df['trade_time'].dt.date
    points_per_day = df.groupby('date').size()

    ticker = Path(file_path).stem  # file name without extension
    invalid_dates = points_per_day[points_per_day != expected_points].index.tolist()
    # Row-wise flag: True where any column of the row is missing.
    nan_rows = df.isna().any(axis=1)

    if invalid_dates or nan_rows.any():
        return ticker, invalid_dates, df[nan_rows]

    return None

def main():
    """Run ``check_data_points_and_nan`` on every CSV in a folder and report.

    The files are checked on a thread pool. For each file with a finding,
    the ticker and the list of dates with an unexpected row count are
    printed, followed by a second line when rows with missing values exist.
    """
    folder_path = 'path_to_your_data_folder'  # change to your data folder path
    file_paths = Path(folder_path).glob('*.csv')

    # os.cpu_count() may return None; fall back to 1. One core is left free.
    max_workers = max(1, (os.cpu_count() or 1) - 1)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(check_data_points_and_nan, file_paths)

        for result in results:
            if result is None:
                # File passed both checks.
                continue
            ticker, invalid_dates, nan_data = result
            print(f'Ticker: {ticker}, Invalid Dates: {invalid_dates}')
            if not nan_data.empty:
                # The check is isna(), i.e. missing values, so say so.
                print(f'Rows with missing (NaN) values found in Ticker: {ticker}')


if __name__ == "__main__":
    main()

### 获取历年流动性<多线程>

In [None]:
import os
import akshare as ak
import json
import pandas as pd
import numpy as np
import concurrent.futures
import os

def process_stock_data(year, ticker):
    """Fetch one year of daily history for *ticker* and summarise liquidity.

    Args:
        year: Calendar year; data is requested from ``<year>0101`` to
            ``<year>1231``.
        ticker: Symbol passed to ``ak.stock_zh_a_hist``.

    Returns:
        ``[ticker, daily_average_turnover, circulating_market_cap,
        trading_days]`` or ``None`` when no rows were returned or the
        request/processing failed. ``daily_average_turnover`` is the mean of
        the ``换手率`` column divided by 100 (the column is presumably in
        percent -- confirm against akshare). ``circulating_market_cap`` is
        the last row's ``成交额`` divided by the last row's ``换手率``, times
        100. ``trading_days`` is the number of rows.
    """
    # Prepare start and end dates
    start_date = f"{year}0101"
    end_date = f"{year}1231"

    try:
        # A single request supplies both columns, halving the network calls.
        hist = ak.stock_zh_a_hist(
            symbol=ticker, start_date=start_date, end_date=end_date, adjust=''
        )

        # No rows for this year: nothing to summarise.
        if hist.empty:
            return None

        turnover = hist['换手率']
        amount = hist['成交额']

        trading_days = len(hist)
        daily_average_turnover = turnover.mean() / 100
        # NOTE(review): a zero turnover on the last row makes this inf/NaN.
        circulating_market_cap = amount.iloc[-1] / turnover.iloc[-1] * 100

        return [ticker, daily_average_turnover, circulating_market_cap, trading_days]

    except Exception as e:
        # Best effort: report the failure and skip this ticker/year.
        print(f"Skipping {ticker} for {year}: {e!r}")
        return None

def main():
    """Compute per-year liquidity statistics for every ticker and save them.

    Tickers are read from ``stock_data/tickers.json``. For each year from
    2000 to 2023, ``process_stock_data`` is run for every ticker on a thread
    pool. The non-empty results are collected into a table indexed by
    ticker, sorted by ``turnover_ratio`` in descending order, and written to
    ``turnovers_<year>.csv`` and ``turnovers_<year>.pkl`` in the current
    working directory. Years without any result produce no files.
    """
    try:
        with open('stock_data/tickers.json') as f:
            tickers = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: file missing/unreadable; ValueError: invalid JSON or encoding.
        print(f"Error reading tickers file: {e}")
        return

    years = range(2000, 2024)
    # os.cpu_count() may return None; fall back to 1. One core is left free.
    max_workers = max(1, (os.cpu_count() or 1) - 1)

    for year in years:
        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_stock_data, year, ticker) for ticker in tickers]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                if result:
                    results.append(result)

        if results:
            df = pd.DataFrame(results, columns=['tickers', 'turnover_ratio', 'circulating_market_cap', 'trading_days'])
            df.set_index('tickers', inplace=True)
            df.sort_values(by='turnover_ratio', ascending=False, inplace=True)
            filename = f'turnovers_{year}.csv'
            df.to_csv(filename)
            df.to_pickle(f'turnovers_{year}.pkl')
            print(f"Data for {year} saved to {filename}")


if __name__ == "__main__":
    main()