### 1.正则提取沪深300股票编码

In [None]:
import re


# Read the entire source document into memory.
with open('input.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# A code is six digits enclosed in half-width or full-width parentheses;
# only the digits themselves are captured.
pattern = r'[（(](\d{6})[)）]'
stock_codes = re.compile(pattern).findall(content)

# Retain the captures that consist solely of digit characters.
valid_codes = list(filter(str.isdigit, stock_codes))

# Persist the codes, one per line, without a trailing newline.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(valid_codes))

print("匹配结果:", valid_codes)


### 2.合并文件后按照公司分组

In [None]:
import pandas as pd
import os
import glob
from tqdm import tqdm  

def process_excel_files(input_dir, output_dir):
    """Merge all workbooks in a directory and write one workbook per stock code.

    Every ``*.xlsx`` file directly inside ``input_dir`` is read, the rows are
    concatenated and sorted by ``Trddt``, and the result is split by ``Stkcd``
    into ``<output_dir>/<Stkcd>.xlsx``.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.xlsx`` files.
        output_dir: Destination directory; created when it does not exist.

    Returns:
        None. When no workbook is found, or none can be read, a message is
        printed and nothing is written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Names starting with '~$' are skipped; Excel uses that prefix for the
    # temporary lock files it creates next to an open workbook.
    all_files = [
        path
        for path in glob.glob(os.path.join(input_dir, "*.xlsx"))
        if not os.path.basename(path).startswith('~$')
    ]
    if not all_files:
        print(f"目录 {input_dir} 中未找到可读取的 .xlsx 文件")
        return

    dfs = []
    for file in tqdm(all_files, desc="读取文件中"):
        try:
            # Stkcd is read as text so codes keep their leading zeros.
            df = pd.read_excel(file, parse_dates=['Trddt'], dtype={'Stkcd': str})
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"文件 {os.path.basename(file)} 读取失败: {str(e)}")
        else:
            dfs.append(df)

    # pd.concat raises ValueError on an empty list, so stop here explicitly.
    if not dfs:
        print("所有文件均读取失败，未生成任何分组文件")
        return

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.sort_values(by='Trddt', inplace=True)

    grouped = combined_df.groupby('Stkcd')
    for stkcd, group in tqdm(grouped, desc="写入分组文件"):
        # '/' would be interpreted as a path separator in the file name.
        safe_stkcd = str(stkcd).replace('/', '_')
        output_path = os.path.join(output_dir, f"{safe_stkcd}.xlsx")

        group.to_excel(output_path, index=False, engine='openpyxl')

if __name__ == "__main__":
    # Placeholder locations; point these at the real folders before running.
    input_folder, output_folder = "/path/to/input_folder", "/path/to/output_folder"

    process_excel_files(input_dir=input_folder, output_dir=output_folder)


### 3.按照公司计算新的列

In [None]:
import pandas as pd
import os
from tqdm import tqdm 

def process_stock_data(input_folder, output_folder):
    """Add ``zhenfu`` and ``zhangdiee`` columns to every workbook in a folder.

    For each ``*.xlsx`` file directly inside ``input_folder`` the rows are
    sorted by ``Trddt`` and two columns are derived from the previous row's
    closing price within the same ``Stkcd``:

    * ``zhenfu``    = (``Hiprc`` - ``Loprc``) / previous ``Clsprc``
    * ``zhangdiee`` = ``Clsprc`` - previous ``Clsprc``

    The result is written to ``<output_folder>/processed_<filename>``.

    Args:
        input_folder: Directory containing the per-company workbooks.
        output_folder: Destination directory; created when it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Select the workbooks up front so the progress bar counts real work only,
    # and skip names starting with '~$' (the prefix Excel uses for lock files).
    xlsx_names = [
        name
        for name in os.listdir(input_folder)
        if name.endswith('.xlsx') and not name.startswith('~$')
    ]
    if not xlsx_names:
        return

    price_columns = ['Clsprc', 'Hiprc', 'Loprc']

    for filename in tqdm(xlsx_names, desc="处理文件中"):
        file_path = os.path.join(input_folder, filename)

        # Stkcd is read as text so codes keep their leading zeros on rewrite.
        df = pd.read_excel(file_path, dtype={'Stkcd': str})

        # Some workbooks hold text in the price columns, which makes the
        # arithmetic below fail with "unsupported operand type(s) for -:
        # 'str' and 'str'". Coerce to numbers; unparseable cells become NaN.
        for column in price_columns:
            df[column] = pd.to_numeric(df[column], errors='coerce')

        df = df.sort_values('Trddt')

        df['prev_clsprc'] = df.groupby('Stkcd')['Clsprc'].shift(1)

        df['zhenfu'] = (df['Hiprc'] - df['Loprc']) / df['prev_clsprc']
        df['zhangdiee'] = df['Clsprc'] - df['prev_clsprc']

        # The helper column is only needed for the two calculations above.
        df.drop(columns=['prev_clsprc'], inplace=True)

        # Rows without a usable previous close (e.g. the first row of each
        # stock) yield NaN; they are reported as 0 in the output.
        df.fillna({'zhenfu': 0, 'zhangdiee': 0}, inplace=True)

        output_path = os.path.join(output_folder, f"processed_{filename}")
        df.to_excel(output_path, index=False, engine='openpyxl')

if __name__ == "__main__":
    # First argument is the input folder, second is the output folder.
    process_stock_data(
        "E:/2412毕业论文/0104数据特征/csmar2/汇总/按公司分组",
        "E:/2412毕业论文/0104数据特征/csmar2/汇总/按公司分组/处理后2",
    )


处理文件中:  99%|█████████▉| 302/304 [06:10<00:02,  1.23s/it]


TypeError: unsupported operand type(s) for -: 'str' and 'str'