## 2025/11/15 线上练习

In [15]:
import os
import fitz  # PyMuPDF
import re
from tqdm import tqdm

# ========= 路径设置 =========
pdf_dir = "./papers"    # 输入 PDF 文件夹
txt_dir = "./papers"    # 输出 TXT 文件夹

os.makedirs(txt_dir, exist_ok=True)

# ========= PDF → 文本提取 =========
def pdf_to_text(pdf_path):
    """提取 PDF 文本"""
    text = ""
    try:
        doc = fitz.open(pdf_path)  # 打开 PDF 文件
        for page in doc:
            text += page.get_text("text")  # 提取纯文本
        doc.close()
    except Exception as e:
        print(f"❌ 无法读取 {pdf_path}：{e}")
    return text

# ========= 文本清洗 =========
def clean_text(text):
    """基础清洗：去页眉页脚、参考文献、空行等"""
    text = re.sub(r"Page \d+ of \d+", "", text)  # 删除页码
    text = re.sub(r"\n{3,}", "\n\n", text)       # 合并多余空行
    text = re.sub(r"\s{2,}", " ", text)          # 合并多余空格
    # 去掉 References 后的内容
    if "References" in text:
        text = text.split("References")[0]
    elif "REFERENCES" in text:
        text = text.split("REFERENCES")[0]
    return text.strip()

# ========= 主函数：批量处理 =========
def process_all_pdfs():
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith(".pdf")]
    for fn in tqdm(pdf_files, desc="Extracting and cleaning PDFs"):
        pdf_path = os.path.join(pdf_dir, fn)
        txt_path = os.path.join(txt_dir, fn.replace(".pdf", ".txt"))

        raw_text = pdf_to_text(pdf_path)
        cleaned_text = clean_text(raw_text)

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

    print(f"✅ 已处理 {len(pdf_files)} 篇 PDF，输出到 {txt_dir}/")


## ========= 程序入口 =========
#if __name__ == "__main__": 
process_all_pdfs()


Extracting and cleaning PDFs: 100%|███████████████| 1/1 [00:00<00:00, 11.88it/s]

✅ 已处理 1 篇 PDF，输出到 ./papers/





In [13]:
#! pip install fitz
#! pip install PyMuPDF
#! pip install frontend

In [12]:
import os
import json
import requests
from tqdm import tqdm
from dotenv import load_dotenv

# 1️⃣ 读取配置
load_dotenv()
api_key = os.getenv("API_KEY")              # 你的 SiliconFlow API 密钥
base_url = os.getenv("API_BASE_URL")        # 一般是 https://api.siliconflow.cn/v1
model_name = os.getenv("API_MODEL")         # 比如 deepseek-ai/DeepSeek-V3 或 Qwen/Qwen2.5-14B-Instruct

# 2️⃣ 创建 headers
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# 3️⃣ 读取模板
with open("template.json", "r", encoding="utf-8") as f:
    template = f.read()

# 4️⃣ 创建输出文件夹
os.makedirs("outputs", exist_ok=True)

# 5️⃣ 遍历论文文件
for filename in tqdm(os.listdir("papers"), desc="Processing papers"):
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join("papers", filename), "r", encoding="utf-8") as f:
        text = f.read()

    # 构造提示词（Prompt）
    prompt = f"""
你是一名材料科学文献分析助手，请根据以下论文内容，
按照模板提取关键信息并输出严格的 JSON 格式结果。
不要解释，不要多余说明，只输出 JSON。

【模板】
{template}

【论文文本】
{text}
"""

    # 6️⃣ 调用 SiliconFlow API
    try:
        payload = {
            "model": model_name,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.0,
            "max_tokens": 4096
        }
    
        response = requests.post(f"{base_url}/chat/completions", headers=headers, json=payload)
     
        if response.status_code == 200:
            result = response.json()["choices"][0]["message"]["content"]
            # 保存结果
            output_path = os.path.join("outputs", filename.replace(".txt", "_output.json"))
            with open(output_path, "w", encoding="utf-8") as out:
                #json.dump(json.loads(result), out, ensure_ascii=False, indent=2)
                out.write(result)
        else:
            print(f"❌ 处理 {filename} 时出错：Error code {response.status_code} - {response.text}")

    except Exception as e:
        print(f"❌ 处理 {filename} 时出错：{e}")

print("✅ 所有文献已处理完成！请查看 outputs/ 文件夹。")


Processing papers: 100%|██████████████████████████| 2/2 [00:10<00:00,  5.10s/it]

✅ 所有文献已处理完成！请查看 outputs/ 文件夹。



