In [7]:
import os
import re
from collections import Counter

import jieba
import pandas as pd
from tqdm import tqdm

# 定义要统计的关键词列表
keywords = ["人工智能","商业智能", "图像理解", "投资决策辅助系统", "智能数据分析", "智能机器人", "机器学习", "深度学习", "语义搜索", "生物识别技术", "人脸识别", "语音识别", "身份验证", "自动驾驶", "自然语言处理", "大数据", "数据挖掘", "文本挖掘", "数据可视化", "异构数据", "征信", "增强现实", "混合现实", "虚拟现实", "云计算", "流计算", "图计算", "内存计算", "多方安全计算", "类脑计算", "绿色计算", "认知计算", "融合架构", "亿级并发", "EB 级存储", "物联网", "信息物理系统", "区块链", "数字货币", "分布式计算", "差分隐私技术", "智能金融合约", "移动互联网", "工业互联网", "移动互联", "互联网医疗", "电子商务", "移动支付", "第三方支付", "NFC 支付", "智能能源", "B2B", "B2C", "C2B", "C2C", "O2O", "网联", "智能穿戴", "智慧农业", "智能交通", "智能医疗", "智能客服", "智能家居", "智能投顾", "智能文旅", "智能环保", "智能电网", "智能营销", "数字营销", "无人零售", "互联网金融", "数字金融", "Fintech", "金融科技", "量化金融", "开放银行"]

# 提前创建好外层的公司代码字典结构，避免每次都判断是否存在
result_dict = {}

#扩增词典
for eachadd in keywords:
    jieba.add_word(eachadd)

# 遍历文件夹
folder_path = "C:/Users/myc/Desktop/A股年报TXT"  # 将这里替换为实际的文件夹路径
file_paths = [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path) if file_name.endswith(".txt")]
os.chdir(folder_path)

with tqdm(total=len(file_paths), desc="处理文件进度") as pbar:
    for file_path in file_paths:
        # 从文件名中提取公司代码（数字形式）和年份信息
        file_name = os.path.basename(file_path)
        match = re.match(r"(\d+)_(\d{4})_.+", file_name)
        if match:
            company_code = match.group(1)
            year = match.group(2)
            if company_code not in result_dict:
                result_dict[company_code] = {}
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                words = jieba.lcut(content)
                text_word_count = len(words)
                if year not in result_dict[company_code]:
                    result_dict[company_code][year] = {}
                    result_dict[company_code][year]["总词数"] = 0
                for keyword in keywords:
                    count = words.count(keyword)
                    result_dict[company_code][year][keyword] = count
                    result_dict[company_code][year]["总词数"] += count
                result_dict[company_code][year]["文本词数"] = text_word_count
        pbar.update(1)

data = []
for company_code, years_data in result_dict.items():
    for year, keyword_counts in years_data.items():
        row = {"公司代码": company_code, "年份": year}
        row.update(keyword_counts)
        data.append(row)

df = pd.DataFrame(data)
df.to_csv("result.csv", index=False)

处理文件进度: 100%|██████████| 11957/11957 [2:03:31<00:00,  1.61it/s] 
