In [1]:
%pip install pdfplumber tqdm
%pip install pandas tqdm tenacity ipython
%pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
from openai import OpenAI
from IPython.display import Markdown, display, HTML
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_exponential
import openpyxl
import json
import os
import time
import shutil
from pathlib import Path
import os
import pdfplumber
from tqdm import tqdm


In [2]:
# 读取包含敏感信息（如 API 密钥）的文本文件
def load_info(file_name):
    try:
        with open(file_name + ".txt", "r", encoding='utf-8') as file:
            info = file.read().strip()
        return info
    except FileNotFoundError:
        raise Exception("{}.txt not found. Please create it.".format(file_name))
    except Exception as e:
        raise Exception(f"Error reading info: {str(e)}")

# 初始化 Moonshot API 客户端。  
client = OpenAI(
        # put your API in kimiapi.txt
        api_key=load_info("kimiapi"),
        base_url = "https://api.moonshot.cn/v1",
)
# 将 API 响应的元数据（如完成原因、令牌使用量）记录到 records.txt 并打印。
def write_content(chat_completion, filename):
    line = f"{filename}\n Finished reason: {chat_completion.choices[0].finish_reason}, completion token: {chat_completion.usage.completion_tokens}, total token: {chat_completion.usage.total_tokens}.\n"     
    with open('records.txt', 'a') as f:        
        f.write(line)
    print(line)   
# 向 Moonshot API 发送请求，处理 PDF 内容并返回 JSON 格式的响应。
def get_openai_response(file_content, prompt, filename, model = "moonshot-v1-32k", max_tokens = 2500):   
    
    chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": file_content},
        {"role": "system", "content": prompt}, 
        {"role": "user", "content": 'Please read the pdf file and return the information in json format. Thank you.'}, 
    ],
    model= model, # "gpt-3.5-turbo", #"gpt-4o-mini",
    temperature=0.3, # smaller is more precise, larger gets creative, 0.3 is the default and works well for kimi
    max_tokens=max_tokens
    )
    write_content(chat_completion, filename)   
     
    return chat_completion.choices[0].message.content
    
def save_to_excel(json_data, filename):
    number = filename.split('.')[0]
    data = json.loads(json_data)
    all_v = []
    for item in data.values():            
        if isinstance(item, list):
            all_v.extend(item)
        elif isinstance(item, dict):
            all_v.extend([item])       
    df = pd.DataFrame(all_v)
    df['FileNumber'] = number

    if not os.path.exists(excel_file):
        df.to_excel(excel_file, index=False, sheet_name=filename)
        print(f"Created a new Excel file at {excel_file}")
    else:
        with pd.ExcelWriter(excel_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
            df.to_excel(writer, index=False, sheet_name=filename)            
    print(f"Data saved successfully to {filename}")
        
# 处理单个文件，调用 API 获取响应，保存到 Excel，并将 JSON 响应保存到文件。
def process_file(file_content,filename):
    try:
        json_response = get_openai_response(file_content, prompt, filename)
        save_to_excel(json_response, filename)
        return True
    except Exception as e:
        print(f"{str(e)}, retrying with 128k model...")        
        try:
            json_response = get_openai_response(file_content, prompt, filename, "moonshot-v1-128k", 5000)
            save_to_excel(json_response, filename)
            return True
        except Exception as e:
            print(f"Error {str(e)} with 128k model...")
                
    with open(f"temp_output/{filename}.json", "w", encoding="utf-8") as f:
        f.write(json_response)  

# 使用 pdfplumber 本地读取 PDF 文件
def read_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text.strip()
    except Exception as e:
        raise Exception(f"读取 PDF 文件 {file_path} 失败: {str(e)}")

# 显示动态HTML进度条
def show_progress_bar(progress, total, elapsed=None, remaining=None):
    """显示动态HTML进度条"""
    progress_html = f"""
    <div style="width:100%;border:1px solid #ccc;padding:3px;margin-bottom:10px">
        <div style="width:{progress/total*100}%;background:#4CAF50;height:20px"></div>
    </div>
    <p style="font-family:Arial;font-size:12px">
        进度: {progress}/{total} | 已用时: {elapsed:.1f}s | 剩余: {remaining:.1f}s
    </p>
    """
    display(HTML(progress_html))  # 使用已导入的HTML

# 生成Markdown格式的输出
def generate_markdown_output(content, is_error=False):
    
    style = "color:red;" if is_error else "color:green;"
    icon = "❌" if is_error else "✅"
    display(Markdown(f"<span style='{style}'>{icon} {content}</span>"))

In [3]:
# 配置
type = "3"
prompt = load_info("paper/1.prompt/prompt"+type)
excel_file = "output/output" + type + ".xlsx"

file_df = pd.read_excel("info/file_list.xlsx")

pdf_folder = "trimm"

# 加载文件列表
try:
    file_df = pd.read_excel("info/file_list.xlsx")
    files_md = "## 已加载文件列表\n"
    files_md += "| 文件名 | 类型 |\n|--------|------|\n"
    for _, row in file_df.iterrows():
        files_md += f"| {row['filename']} | {row.get('Type', 'N/A')} |\n"
    display(Markdown(files_md))
    generate_markdown_output(f"成功加载 {len(file_df)} 个文件", is_error=False)
except Exception as e:
    generate_markdown_output(f"加载 file_list.xlsx 失败: {str(e)}", is_error=True)  # 显式设置 is_error=True
    raise

# 检查现有 Excel 工作表以避免重复处理
if os.path.exists(excel_file):
    sheet_names = pd.ExcelFile(excel_file).sheet_names
    generate_markdown_output(f"在 {excel_file} 中找到 {len(sheet_names)} 个现有工作表")
else:
    sheet_names = []
    generate_markdown_output(f"在 {excel_file} 未找到现有 Excel 文件")

# 确保输出目录存在
os.makedirs("output", exist_ok=True)

# 处理文件
total_files = len(file_df)
start_time = time.time()

for index, row in tqdm(file_df.iterrows(), total=len(file_df), desc="处理 PDF 文件"):
    filename = row['filename'].replace('_'+row['Type'], '').strip()

    # 更新进度条
    elapsed = time.time() - start_time
    avg_time = elapsed / (index + 1)
    remaining = avg_time * (total_files - index - 1)
    show_progress_bar(index + 1, total_files, elapsed, remaining)

    if filename not in sheet_names:
        pdf_path = os.path.join(pdf_folder, row['filename'])
        generate_markdown_output(f"正在处理文件: {filename} ({pdf_path})")
        
        file_start_time = time.time()
        try:
            # 本地读取 PDF
            file_content = read_pdf(pdf_path)
            generate_markdown_output(f"从 {pdf_path} 读取 PDF 内容 (长度: {len(file_content)} 字符)")
            
            # 处理文件
            process_file(file_content, filename, )
            generate_markdown_output(f"成功处理 {filename}，耗时 {time.time() - start_time:.2f} 秒")
            
            # 添加延迟以避免 API 速率限制
            time.sleep(1)
            
        except Exception as e:
            generate_markdown_output(f"处理 {filename} 失败: {str(e)}",is_error = True)
            continue

# 最终完成信息
generate_markdown_output(f"所有文件处理完成！总耗时: {time.time() - start_time:.2f}秒")

## 已加载文件列表
| 文件名 | 类型 |
|--------|------|
| 1-s2.0-S0269749118332950-mmc1.pdf | supplementary material |
| 1-s2.0-S0269749119345816-mmc1.pdf | supplementary material |
| 11356_1_paper.pdf | paper.pdf |
| 11356_paper.pdf | paper.pdf |
| 北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf | Chinese research paper |
| 补充材料.pdf | supplementary material |
| 补充材料1.pdf | supplementary material |
| 补充来料2.pdf | supplementary material |


<span style='color:green;'>✅ 成功加载 8 个文件</span>

<span style='color:green;'>✅ 在 output/output3.xlsx 未找到现有 Excel 文件</span>

处理 PDF 文件:   0%|          | 0/8 [00:00<?, ?it/s]

<span style='color:green;'>✅ 正在处理文件: 1-s2.0-S0269749118332950-mmc1.pdf (trimm\1-s2.0-S0269749118332950-mmc1.pdf)</span>

<span style='color:green;'>✅ 从 trimm\1-s2.0-S0269749118332950-mmc1.pdf 读取 PDF 内容 (长度: 14760 字符)</span>

1-s2.0-S0269749118332950-mmc1.pdf
 Finished reason: length, completion token: 2500, total token: 10562.

Expecting value: line 1 column 1 (char 0), retrying with 128k model...
1-s2.0-S0269749118332950-mmc1.pdf
 Finished reason: stop, completion token: 1516, total token: 9578.

Error Expecting value: line 1 column 1 (char 0) with 128k model...


<span style='color:green;'>✅ 成功处理 1-s2.0-S0269749118332950-mmc1.pdf，耗时 84.29 秒</span>

处理 PDF 文件:  12%|█▎        | 1/8 [01:25<09:56, 85.21s/it]

<span style='color:green;'>✅ 正在处理文件: 1-s2.0-S0269749119345816-mmc1.pdf (trimm\1-s2.0-S0269749119345816-mmc1.pdf)</span>

<span style='color:green;'>✅ 从 trimm\1-s2.0-S0269749119345816-mmc1.pdf 读取 PDF 内容 (长度: 19508 字符)</span>

1-s2.0-S0269749119345816-mmc1.pdf
 Finished reason: stop, completion token: 1621, total token: 11221.

Expecting value: line 1 column 1 (char 0), retrying with 128k model...
1-s2.0-S0269749119345816-mmc1.pdf
 Finished reason: stop, completion token: 1884, total token: 11484.

Created a new Excel file at output/output3.xlsx
Data saved successfully to 1-s2.0-S0269749119345816-mmc1.pdf




<span style='color:green;'>✅ 成功处理 1-s2.0-S0269749119345816-mmc1.pdf，耗时 157.19 秒</span>

处理 PDF 文件:  25%|██▌       | 2/8 [02:38<07:47, 77.96s/it]

<span style='color:green;'>✅ 正在处理文件: 11356_1 (trimm\11356_1_paper.pdf)</span>

<span style='color:green;'>✅ 从 trimm\11356_1_paper.pdf 读取 PDF 内容 (长度: 12593 字符)</span>

11356_1
 Finished reason: length, completion token: 2500, total token: 9521.

Unterminated string starting at: line 246 column 7 (char 7781), retrying with 128k model...
11356_1
 Finished reason: stop, completion token: 2385, total token: 9406.

Data saved successfully to 11356_1




<span style='color:green;'>✅ 成功处理 11356_1，耗时 253.73 秒</span>

处理 PDF 文件:  38%|███▊      | 3/8 [04:14<07:12, 86.45s/it]

<span style='color:green;'>✅ 正在处理文件: 11356 (trimm\11356_paper.pdf)</span>

<span style='color:green;'>✅ 从 trimm\11356_paper.pdf 读取 PDF 内容 (长度: 38716 字符)</span>

11356
 Finished reason: stop, completion token: 1628, total token: 12848.

Expecting value: line 1 column 1 (char 0), retrying with 128k model...
11356
 Finished reason: stop, completion token: 1686, total token: 12906.

Error Expecting value: line 1 column 1 (char 0) with 128k model...


<span style='color:green;'>✅ 成功处理 11356，耗时 327.52 秒</span>

处理 PDF 文件:  50%|█████     | 4/8 [05:28<05:25, 81.45s/it]

<span style='color:green;'>✅ 正在处理文件: 北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf (trimm\北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf)</span>

<span style='color:green;'>✅ 从 trimm\北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf 读取 PDF 内容 (长度: 30673 字符)</span>

北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf
 Finished reason: length, completion token: 2500, total token: 18006.

Unterminated string starting at: line 201 column 18 (char 5344), retrying with 128k model...
北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf
 Finished reason: stop, completion token: 3085, total token: 18591.

Data saved successfully to 北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf




<span style='color:green;'>✅ 成功处理 北京市再生水补水河流中抗生素的赋存特征及生态风险评估_武亚林.pdf，耗时 474.37 秒</span>

处理 PDF 文件:  62%|██████▎   | 5/8 [07:55<05:15, 105.03s/it]

<span style='color:green;'>✅ 正在处理文件: 补充材料.pdf (trimm\补充材料.pdf)</span>

<span style='color:green;'>✅ 从 trimm\补充材料.pdf 读取 PDF 内容 (长度: 1952 字符)</span>

补充材料.pdf
 Finished reason: length, completion token: 2500, total token: 3654.

Expecting value: line 1 column 1 (char 0), retrying with 128k model...
补充材料.pdf
 Finished reason: length, completion token: 5000, total token: 6154.

Error Unterminated string starting at: line 576 column 24 (char 15269) with 128k model...


<span style='color:green;'>✅ 成功处理 补充材料.pdf，耗时 631.72 秒</span>

处理 PDF 文件:  75%|███████▌  | 6/8 [10:32<04:05, 122.82s/it]

<span style='color:green;'>✅ 正在处理文件: 补充材料1.pdf (trimm\补充材料1.pdf)</span>

<span style='color:green;'>✅ 从 trimm\补充材料1.pdf 读取 PDF 内容 (长度: 11931 字符)</span>

补充材料1.pdf
 Finished reason: stop, completion token: 1835, total token: 7132.

Expecting value: line 1 column 1 (char 0), retrying with 128k model...
补充材料1.pdf
 Finished reason: stop, completion token: 1704, total token: 7001.

Data saved successfully to 补充材料1.pdf




<span style='color:green;'>✅ 成功处理 补充材料1.pdf，耗时 707.90 秒</span>

处理 PDF 文件:  88%|████████▊ | 7/8 [11:48<01:47, 107.57s/it]

<span style='color:green;'>✅ 正在处理文件: 补充来料2.pdf (trimm\补充来料2.pdf)</span>

<span style='color:green;'>✅ 从 trimm\补充来料2.pdf 读取 PDF 内容 (长度: 17735 字符)</span>

补充来料2.pdf
 Finished reason: stop, completion token: 2215, total token: 13609.

Data saved successfully to 补充来料2.pdf




<span style='color:green;'>✅ 成功处理 补充来料2.pdf，耗时 753.84 秒</span>

处理 PDF 文件: 100%|██████████| 8/8 [12:34<00:00, 94.34s/it] 


<span style='color:green;'>✅ 所有文件处理完成！总耗时: 754.84秒</span>