In [1]:
import os
import requests
import pandas as pd
import time
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv  # ✅ 加载 .env 所需

# ✅ 加载环境变量（确保你本地有 model.env 文件）
load_dotenv("qw3b.env")  # 指定文件名

# ✅ 从环境变量中获取 API_KEY 和模型名
api_key = os.getenv("API_KEY")
model_name = os.getenv("MODEL_NAME")

if not api_key or not model_name:
    raise ValueError("❌ 缺少 API_KEY 或 MODEL_NAME，请检查 model.env 文件是否设置正确")

url = "https://api.siliconflow.cn/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}
# ✅ 输入输出路径
input_folder = "/Users/yukun/Desktop/mda/MDA训练集"
output_csv_path = "519qw3b_text.csv"

# ✅ 提取字段
output_fields = [
    "文件名",
    "董事会成员变动",
    "董秘变更",
    "高级管理层变更",
    "重大资产重组",
    "控股股东/实际控制人变更",
    "子公司或重要分支机构重大变化",
    "对财务状况的分析",
    "营业收入增长情况",
    "净利润变动分析",
    "毛利率变动分析",
    "销售费用率变化",
    "管理费用控制情况",
    "经营性现金流变化趋势",
    "应收账款变化与风险",
    "资产负债结构调整",
    "利润质量分析（是否依赖非经常性损益）",
    "核心竞争力分析（品牌、成本、效率等）",
    "市场拓展与业务布局进展",
    "区域/产品结构优化情况",
    "供应链/渠道建设进展",
    "战略合作与外部联盟情况",
    "投资项目执行及回报分析",
    "企业文化建设与团队稳定性",
    "对标管理与精细化运营成果",
    "技术创新与研发投入",
    "新产品/新技术推出情况",
    "研发费用占比变化",
    "专利数量与知识产权布局",
    "智能制造/数字化转型进展",
    "行业竞争格局分析",
    "行业发展趋势与机会识别",
    "国家政策影响与公司应对",
    "未来经营计划与增长目标",
    "订单/客户/签约情况展望",
    "海外市场/出口业务计划",
    "对经济周期或宏观环境的判断",
    "面临的主要风险因素及变化",
    "政策/监管风险与公司响应",
    "环保合规/碳排放压力",
    "土地/原材料/人工成本波动风险",
    "资金链紧张预警",
    "核心客户集中度及流失风险",
    "可持续发展战略与目标",
    "社会责任履行与公益活动",
    "节能降耗/绿色生产措施"
]

# ✅ Prompt模板
def build_prompt(text):
    return f"""
以下是某公司年报中的“管理层讨论与分析”部分内容，请你遵循以下要求处理该文本：

1. 删除冗余性强的空话、套话。
2. 仅保留具有研究价值的具体内容，尤其是与公司治理和业绩相关的信息。
3. 请严格按照以下结构模板输出内容，每个条目都必须出现，即使原文没有相关内容，也请写明“无相关信息”或“未提及”。

【固定输出结构如下】：
{chr(10).join([f"- 【{field}】：" for field in output_fields[1:]])}

原文如下：
{text}
"""

# ✅ 每个文件的处理函数
def process_file(file_path):
    filename = os.path.basename(file_path)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        prompt = build_prompt(text)
        data = {
            "model": model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 2048
        }

        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        response_data = response.json()

        if "choices" not in response_data:
            raise ValueError("接口返回中未包含choices字段，可能是模型名称错误或token用尽")

        content = response_data["choices"][0]["message"]["content"]
        row = [filename]

        for field in output_fields[1:]:
            key = f"- 【{field}】："
            start = content.find(key)
            if start != -1:
                end = content.find("- 【", start + 1)
                value = content[start + len(key):end].strip() if end != -1 else content[start + len(key):].strip()
            else:
                value = "未提及"
            row.append(value)

        print(f"✔ 完成：{filename}")
        return row

    except Exception as e:
        print(f"✘ 出错：{filename}，错误：{e}")
        return [filename] + ["ERROR"] * (len(output_fields) - 1)

# ✅ 并发运行
def run_parallel():
    file_paths = [
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith(".txt")
    ]

    results = []

    with ThreadPoolExecutor(max_workers=5) as executor:  # 控制并发线程数
        future_to_file = {executor.submit(process_file, file): file for file in file_paths}

        for future in as_completed(future_to_file):
            result = future.result()
            results.append(result)

    df = pd.DataFrame(results, columns=output_fields)
    df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")
    print(f"\n✅ 全部处理完成，结果保存在：{output_csv_path}")

if __name__ == "__main__":
    run_parallel()

✔ 完成：895-双汇发展-2021.txt
✔ 完成：963-华东医药-2021.txt
✔ 完成：2475-立讯精密-2021.txt
✔ 完成：2429-兆驰股份-2021.txt
✔ 完成：600031-三一重工-2021.txt
✔ 完成：2552-宝鼎科技-2021.txt
✔ 完成：300339-润和软件-2021.txt
✔ 完成：600566-济川药业-2021.txt
✔ 完成：2507-涪陵榨菜-2021.txt
✔ 完成：2415-海康威视-2021.txt
✔ 完成：425-徐工机械-2021.txt
✔ 完成：2241-歌尔股份-2021.txt
✘ 出错：600519-贵州茅台-2021.txt，错误：HTTPSConnectionPool(host='api.siliconflow.cn', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by ProxyError('Unable to connect to proxy', RemoteDisconnected('Remote end closed connection without response')))✘ 出错：2230-科大讯飞-2021.txt，错误：HTTPSConnectionPool(host='api.siliconflow.cn', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by ProxyError('Unable to connect to proxy', RemoteDisconnected('Remote end closed connection without response')))
✘ 出错：600276-恒瑞医药-2021.txt，错误：HTTPSConnectionPool(host='api.siliconflow.cn', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by ProxyError('Unable to connect to proxy',

In [3]:
print(f"API_KEY: {api_key[:4]}... (长度: {len(api_key)})")
print(f"Model Name: {model_name}")

API_KEY: sk-v... (长度: 51)
Model Name: Qwen/Qwen3-8B


In [19]:
def process_file(file_path):
    filename = os.path.basename(file_path)
    print(f"开始处理文件：{filename}")

    try:
        print("开始读取文件内容...")
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        print(f"文件读取完毕，长度：{len(text)} 字符")

        print("开始构建 prompt ...")
        prompt = build_prompt(text)
        print(f"Prompt 构建完成，长度：{len(prompt)} 字符")

        data = {
            "model": model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 2048
        }

        print("开始发送请求到模型接口...")
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        print("请求成功，解析返回数据...")
        response_data = response.json()

        if "choices" not in response_data:
            raise ValueError("接口返回中未包含choices字段，可能是模型名称错误或token用尽")

        content = response_data["choices"][0]["message"]["content"]
        print(f"提取到模型返回内容，长度：{len(content)} 字符")
        row = [filename]

        for field in output_fields[1:]:
            key = f"- 【{field}】："
            start = content.find(key)
            if start != -1:
                end = content.find("- 【", start + 1)
                value = content[start + len(key):end].strip() if end != -1 else content[start + len(key):].strip()
            else:
                value = "未提及"
            row.append(value)

        print(f"✔ 完成：{filename}")
        return row

    except Exception as e:
        print(f"✘ 出错：{filename}，错误：{e}")
        return [filename] + ["ERROR"] * (len(output_fields) - 1)
