In [None]:

import os
import json
import requests
import traceback
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
from Arxiv_Parser.paper_parser import parse_html
from Arxiv_Parser.paper_storage import save_paper_data
from LLM.llm import MultiLLM
from Task_Conductor.prompts import RelevanceTask

class ProcessingConfig:
    """Central configuration for paper processing (simplified).

    Resolves the working directories under ``root_dir``, creates them on
    construction, and decides where the JSON for a given arXiv id is written.
    """

    # Sub-directory of ``root_dir`` used when no custom output is configured.
    DEFAULT_SAVE_DIR = "papers"
    # File name pattern for a paper's JSON output.
    FILENAME_TEMPLATE = "paper_{arxiv_id}.json"

    def __init__(self, root_dir: str = ".", custom_output: Optional[str] = None):
        """Store the configured locations and create the working directories.

        Args:
            root_dir: Base directory; ``~`` is expanded and the path is made
                absolute.
            custom_output: Optional override for the output location. It may
                name an existing directory or a file path. ``~`` is expanded
                so it accepts the same input forms as ``root_dir``.
        """
        self.root_dir = Path(root_dir).expanduser().resolve()
        # Without expanduser() a value such as "~/out" would be taken as a
        # literal "~" directory relative to the current working directory.
        self.custom_output = (
            Path(custom_output).expanduser() if custom_output else None
        )
        self._init_paths()

    def _init_paths(self):
        """Derive the state/output directories and create them (no HTML cache)."""
        self.STATE_DIR = self.root_dir / ".status"
        self.OUTPUT_DIR = self.root_dir / self.DEFAULT_SAVE_DIR

        for directory in (self.STATE_DIR, self.OUTPUT_DIR):
            directory.mkdir(parents=True, exist_ok=True)

    def get_output_path(self, arxiv_id: str) -> Path:
        """Return the final output path for ``arxiv_id``.

        Resolution order:
            1. ``custom_output`` is an existing directory: the templated file
               name inside that directory.
            2. ``custom_output`` is anything else: that path itself.
            3. No ``custom_output``: the templated file name inside
               ``OUTPUT_DIR``.
        """
        filename = self.FILENAME_TEMPLATE.format(arxiv_id=arxiv_id)
        if not self.custom_output:
            return self.OUTPUT_DIR / filename
        if self.custom_output.is_dir():
            return self.custom_output / filename
        return self.custom_output

class PaperProcessor:
    """Fetch an arXiv HTML page, parse it, and save the result (no HTML cache).

    The processor keeps a small ``_state`` dict holding the current pipeline
    step, metadata for the current run, and section/reference counts.
    """

    VERSION = "2.1"
    ENCODING = 'utf-8'
    # Seconds to wait for the HTTP response. Without a timeout, requests.get
    # may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: "ProcessingConfig"):
        """Bind the configuration, create the LLM client, and reset state."""
        self.config = config
        self.llm = MultiLLM('deepseek-coder')
        self._state = {
            "current_step": None,
            "metadata": {},
            "stats": {
                "sections": 0,
                "references": 0
            }
        }

    def process(self, url: str, keyword: Optional[str] = None) -> Path:
        """Run the full pipeline for one paper.

        Steps: derive the arXiv id and output path, download ``url``, parse
        the HTML, save the parsed data, and, when ``keyword`` is given, score
        the abstract's relevance to it.

        Args:
            url: Address of the paper page to download.
            keyword: Optional keyword; when truthy a relevance score is
                computed and printed.

        Returns:
            The path the parsed paper was written to.

        Raises:
            RuntimeError: On any failure. The message is a JSON document with
                the failing step, the error type, the message and the
                traceback; the original exception is chained as the cause.
        """
        try:
            # Initialise metadata
            self._update_state({"current_step": "init"})
            arxiv_id = self._extract_arxiv_id(url)
            output_path = self.config.get_output_path(arxiv_id)

            self._update_state({
                "metadata": {
                    "source_url": url,
                    "arxiv_id": arxiv_id,
                    "keyword": keyword,
                    "output_path": str(output_path),
                    "timestamp": datetime.now().isoformat()
                }
            })

            # Fetch the content
            self._update_state({"current_step": "fetch"})
            print("🔄 获取论文内容...", end='', flush=True)
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.encoding = self.ENCODING
            response.raise_for_status()
            print("✅")

            # Parse the content
            self._update_state({"current_step": "parse"})
            print("🔍 解析论文结构...", end='', flush=True)
            paper_data = parse_html(response.text)
            self._update_state({
                "stats": {
                    "sections": len(paper_data.get("sections", [])),
                    "references": len(paper_data.get("references", []))
                }
            })
            print(f"✅ 找到 {self._state['stats']['sections']} 个章节")

            # Save the result
            self._update_state({"current_step": "save"})
            print(f"💾 保存到：{output_path}...", end='', flush=True)
            save_paper_data(
                paper_data,
                str(output_path),
                encoding=self.ENCODING
            )
            print(f"✅ ({output_path.stat().st_size / 1024:.1f} KB)")

            # Keyword analysis
            if keyword:
                self._update_state({"current_step": "relevance"})
                print(f"🔎 分析关键词 '{keyword}' 相关性...")
                # A missing "abstract" key raises KeyError here, which the
                # handler below reports as a RuntimeError.
                score = self._analyze_relevance(paper_data["abstract"], keyword)
                print(f"⭐ 相关性评分：{score}/1.0")

            return output_path

        except Exception as e:
            error_info = {
                "step": self._state.get("current_step"),
                "error_type": type(e).__name__,
                "message": str(e),
                "traceback": traceback.format_exc()
            }
            print(f"\n❌ 处理失败：{error_info['message']}")
            raise RuntimeError(json.dumps(error_info, ensure_ascii=False)) from e

    def _analyze_relevance(self, abstract: str, keyword: str) -> float:
        """Ask the LLM how relevant ``abstract`` is to ``keyword``.

        Builds a ``RelevanceTask`` prompt, sends it through ``self.llm`` and
        returns whatever the task parses out of the model's reply.
        """
        task = RelevanceTask(abstract, keyword)
        response = self.llm.ask(task.generate_prompt())
        return task.parse_model_output(response)

    @staticmethod
    def _extract_arxiv_id(url: str) -> str:
        """Extract the version-less arXiv id from the last path segment of ``url``.

        Query strings, fragments and trailing slashes are ignored, a trailing
        ``.pdf`` extension is dropped, and any trailing version marker
        (``v1``, ``v2``, ``v12`` ...) is removed, e.g.
        ``https://arxiv.org/html/2501.00092v2`` -> ``2501.00092``.
        """
        import re

        # Drop fragment and query first so they cannot end up in the file name.
        path = url.split("#", 1)[0].split("?", 1)[0]
        # rstrip("/") keeps a trailing slash from producing an empty segment.
        segment = path.rstrip("/").split("/")[-1]
        if segment.lower().endswith(".pdf"):
            segment = segment[:-len(".pdf")]
        # Anchor at the end so only a real version suffix is removed, for any
        # version number rather than just "v1".
        return re.sub(r"v\d+$", "", segment)

    def _update_state(self, update_dict: dict):
        """Shallow-merge ``update_dict`` into the state.

        Top-level keys are replaced wholesale; nested dicts are not merged.
        """
        self._state.update(update_dict)

# Enhanced save function (meant for paper_storage.py). Defining it here
# rebinds the name imported from Arxiv_Parser.paper_storage.
def save_paper_data(data: dict, filename: str, encoding='utf-8'):
    """Write ``data`` to ``filename`` as indented JSON.

    Missing parent directories are created first. Non-ASCII text is written
    as-is rather than escaped, and characters that ``encoding`` cannot
    represent are replaced instead of raising.
    """
    target = Path(filename)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding=encoding, errors='replace') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)


In [None]:
# Working directory for this run; ProcessingConfig creates its ".status" and
# "papers" sub-directories on construction.
config = ProcessingConfig(root_dir=r"C:\Users\Inuyasha\Programs\Python\AIGC\Arxiv_Reviewer/research_papers")

# Download, parse and save one paper, then score its relevance to the keyword.
processor = PaperProcessor(config)
result_path = processor.process(url="https://arxiv.org/html/2501.00092v1", keyword="AI")


🔄 获取论文内容...