In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

# The field name doubles as the CSV file stem inside the fields_csv folder.
field_name = "课程网址"
_fields_csv_dir = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math/fields_csv"
field_path = f"{_fields_csv_dir}/{field_name}.csv"
field_df = pd.read_csv(field_path)

# Last expression of the cell: shown as (rows, columns) in the output.
field_df.shape

(454, 9)

In [2]:
# Prompt sent to Gemini for every program row. It is filled in with
# str.format and expects the keys: university, department, degree, program,
# admissions_url and program_url. The model is asked to reply with exactly one
# line: either a single URL or the literal text "Not found".
prompt_template = """
You are an assistant whose only task is to return the **URL of the official page that lists this program’s curriculum / required and elective courses**.

────────────────────────────────────────────────────────
How to find the correct URL

1. **Primary *.edu* sources only**  
   • Examine the Admissions and Program URLs provided.  
   • You may open other pages within the same university’s *.edu* domain (e.g., “Curriculum”, “Course List”, “Degree Requirements”, “Program Handbook”).  
   ⛔  Ignore non-*.edu* sites, PDFs without a stable web address, blogs, or rankings.

2. **Optional Google search**  
   Query once:  
   "{university} {department} {degree} {program} curriculum courses degree requirements site:.edu"  
   Follow *.edu* results until you locate the page that explicitly lists the program’s required and elective courses.

3. **Selection rules**  
   • Choose **one** URL (https://…) that best displays the full course list for this program.  
   • If several pages exist, prefer the most specific (program-level) page over school-wide catalogs.  
   • Do **not** perform any summarization or explanation.  
   • If no suitable *.edu* URL is found, output **Not found**.  
   • Never invent a link or return a invalid link

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, no quotes, no extra text)  

Valid examples:  
https://engineering.university.edu/department/ms-data-science/curriculum  
Not found  

────────────────────────────────────────────────────────
Pages to consult first:
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

What is the URL that shows the program’s course requirements?
"""



In [3]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# process_row holds this semaphore while it awaits all of its `num_vote`
# Gemini calls, so up to 3 * num_vote requests can be in flight at once.
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(3)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    Query Gemini `num_vote` times for one program row and collect the answers.

    Steps:
    1. Format the prompt for this row.
    2. Launch `num_vote` Gemini calls in parallel.
    3. Record every answer, including calls that failed or raised.
    4. Return a JSON-serialisable record.

    Args:
        row: object with a ``to_dict()`` method (a DataFrame row) providing
            the columns read below.
        prompt_template: ``str.format`` template using the keys university,
            degree, program, department, admissions_url and program_url.
        num_vote: number of independent Gemini calls to make for this row.
        model_name: model identifier forwarded to ``async_call_gemini``.

    Returns:
        dict: a copy of the row plus ``"llm_reponses"`` (key spelling kept,
        because later cells read it), mapping ``"response <k>"`` to either an
        ``{"error": ...}`` entry or the answer text with its metadata.
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系（英文）"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        # return_exceptions=True: a call that raises is recorded for its own
        # slot instead of aborting this row and, through the outer gather,
        # the whole batch.
        responses = await asyncio.gather(*tasks, return_exceptions=True)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 0. Exception raised inside the Gemini call itself
            if isinstance(response, BaseException):
                record["llm_reponses"][resp_key] = {
                    # Same "Error:" prefix as the string errors handled below.
                    "error": f"Error: {type(response).__name__}: {response}"
                }
                continue

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            # -- 3. Extract main answer text
            try:
                text = response.candidates[0].content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            candidate = response.candidates[0]

            try:
                url_context = str(candidate.url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{candidate.grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{candidate.grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                # snake_case like the sibling fields; the camelCase spelling
                # raised AttributeError and always produced "Not used".
                # NOTE(review): assumes the wrapper returns google-genai
                # response objects; confirm against call_api.
                search_support = (
                    f"Search Supports: "
                    f"{candidate.grounding_metadata.grounding_supports}"
                )
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Run `process_row` over a slice of the dataframe concurrently, show a
    tqdm progress bar, and dump the collected records to JSON.

    Args:
        prompt_template: template forwarded to `process_row`.
        field_df: dataframe with one program per row.
        num_vote: number of Gemini calls per row.
        model_name: model identifier; also part of the output file name.
        start_from: positional index of the first row to process.
        end_at: positional index one past the last row to process. The
            default ``-1`` means "through the last row".

    Returns:
        list: one record per processed row, in dataframe order.

    Side effects:
        Writes ``../fields_records/<field_name>/<field_name>_<model_name>_
        <start_from>_<end_at>.json`` (UTF-8), creating the directory if
        needed. ``field_name`` is read from the notebook globals.
    """
    # The default -1 used directly as a slice stop silently dropped the
    # last row; treat it as "until the end" instead.
    stop = len(field_df) if end_at == -1 else end_at
    df = field_df.iloc[start_from:stop]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    # The file name keeps the caller-supplied end_at so existing paths stay valid.
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    # ensure_ascii=False writes raw non-ASCII text, so pin the encoding
    # rather than relying on the platform default; later cells read the
    # file back as UTF-8.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [4]:
import nest_asyncio

# Jupyter already runs an event loop; patching it lets asyncio.run() be
# called from inside a cell.
nest_asyncio.apply()

# Run parameters: three answers per program, over every row of field_df.
num_vote = 3
start_from, end_at = 0, len(field_df)
model_name = "gemini-2.5-flash"

_batch = request_and_store_async(
    prompt_template,
    field_df,
    num_vote,
    model_name,
    start_from=start_from,
    end_at=end_at,
)
response_records = asyncio.run(_batch)

  0%|          | 0/454 [00:00<?, ?it/s]

100%|██████████| 454/454 [1:08:13<00:00,  9.02s/it]


In [5]:
# First-pass answers and the destination for the repaired copy; both live in
# the same records folder.
_records_dir = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math/fields_records/课程网址"
input_json_path = f"{_records_dir}/课程网址_gemini-2.5-flash_0_454.json"
output_json_path = f"{_records_dir}/课程网址_gemini-2.5-flash_0_454_fixed.json"

In [6]:
import json
import pandas as pd
import requests
import asyncio
import aiohttp
import random
import time
from urllib.parse import urlparse
import re
import os
from tqdm.asyncio import tqdm_asyncio
import sys
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math')
from call_api import async_call_gemini

class CurriculumURLValidator:
    """Validate stored curriculum-URL answers and re-query Gemini for bad ones."""

    # An http(s) URL, running up to the next whitespace character.
    _URL_PATTERN = r'https?://[^\s]+'

    def __init__(self, max_retries=2, timeout=15):
        """
        Args:
            max_retries (int): Gemini re-query attempts per invalid answer.
            timeout (int): total seconds allowed for one HTTP fetch.
        """
        self.max_retries = max_retries
        self.timeout = timeout
        self.semaphore = asyncio.Semaphore(2)  # caps concurrent Gemini re-queries

    def is_valid_curriculum_url_response(self, response_text):
        """
        Decide whether an answer contains a usable URL.

        Only the URL format is checked (scheme and host); the domain is not
        restricted. Refusal/error phrases are looked for in the text around
        the URLs only, so a link such as ``.../courses/math404`` is not
        rejected because of its own path.

        Returns:
            tuple: (is_valid, reason)
        """
        if not response_text or response_text.strip() == "":
            return False, "空回答"

        # Phrases that mark a refusal or an error message.
        invalid_patterns = [
            r"not found",
            r"not exist",
            r"no curriculum",
            r"unable to find",
            r"cannot find",
            r"error",
            r"404",
            r"not available"
        ]

        urls = re.findall(self._URL_PATTERN, response_text)

        # Blank out the URLs before scanning for refusal phrases.
        text_outside_urls = re.sub(self._URL_PATTERN, " ", response_text).lower()
        for pattern in invalid_patterns:
            if re.search(pattern, text_outside_urls):
                return False, f"包含无效模式: {pattern}"

        if not urls:
            return False, "未找到URL"

        # Accept as soon as one URL has both a scheme and a host.
        for url in urls:
            try:
                parsed = urlparse(url)
            except ValueError:
                # urlparse rejects some malformed hosts (e.g. bad IPv6 literals).
                continue
            if parsed.scheme in ('http', 'https') and parsed.netloc:
                return True, "有效回答"

        # The former "longer than 300 characters and no URL" check was
        # removed: every URL tested there had just been extracted from the
        # same text, so the condition could never be true.
        return False, "未找到有效格式的URL"

    async def fetch_url_with_enhanced_retry(self, url, max_retries=4):
        """
        Fetch a URL with retries and randomized exponential backoff.

        HTTP 404/401/403 and a 200 response with 500 characters or fewer
        are treated as final and returned immediately; other statuses,
        timeouts and connection errors are retried.

        Args:
            url (str): URL to fetch.
            max_retries (int): maximum number of attempts.

        Returns:
            tuple: (success, content_or_error, status_info)
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # Cause of the most recent failed attempt, reported if all attempts
        # fail (previously the final status always claimed a timeout).
        last_error = "未进行任何请求"

        for attempt in range(1, max_retries + 1):
            try:
                async with aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=self.timeout),
                    headers=headers
                ) as session:
                    async with session.get(url) as response:
                        if response.status == 200:
                            content = await response.text()
                            if len(content) > 500:  # require a non-trivial page body
                                print(f"      ✅ 成功获取 (尝试 {attempt}/{max_retries})")
                                return True, content, f"成功 (尝试 {attempt})"
                            else:
                                print(f"      ⚠️ 内容过短 ({len(content)} 字符)")
                                return False, f"内容过短: {len(content)} 字符", f"内容过短 (尝试 {attempt})"
                        elif response.status == 404:
                            print(f"      ❌ 页面不存在 (HTTP {response.status})")
                            return False, f"HTTP {response.status}", "永久性错误: 页面不存在"
                        elif response.status in [403, 401]:
                            print(f"      ❌ 访问被拒绝 (HTTP {response.status})")
                            return False, f"HTTP {response.status}", "永久性错误: 访问被拒绝"
                        else:
                            last_error = f"HTTP {response.status}"
                            print(f"      ❌ 尝试 {attempt}/{max_retries}: HTTP {response.status}")

            except asyncio.TimeoutError:
                last_error = f"请求超时 ({self.timeout}秒)"
                print(f"      ❌ 尝试 {attempt}/{max_retries}: 请求超时 ({self.timeout}秒)")
            except aiohttp.ClientError as e:
                last_error = f"连接错误: {str(e)}"
                print(f"      ❌ 尝试 {attempt}/{max_retries}: 连接错误: {str(e)}")
            except Exception as e:
                last_error = f"未知错误: {str(e)}"
                print(f"      ❌ 尝试 {attempt}/{max_retries}: 未知错误: {str(e)}")

            # Back off before the next attempt, except after the last one.
            if attempt < max_retries:
                wait_time = random.uniform(2 ** attempt, 2 ** (attempt + 1))
                print(f"      ⏳ 等待 {wait_time:.1f} 秒后重试...")
                await asyncio.sleep(wait_time)

        return False, "所有尝试失败", f"最终失败: {last_error}"

    @staticmethod
    def _collect_metadata(candidate):
        """Best-effort string rendering of a candidate's URL-context and grounding metadata."""
        def attempt(render):
            # Any failure while reading or formatting a field means it was not used.
            try:
                return render()
            except Exception:
                return "Not used"

        return {
            "url_context": attempt(lambda: str(candidate.url_context_metadata)),
            "search_queries": attempt(
                lambda: f"Search Query: {candidate.grounding_metadata.web_search_queries}"
            ),
            "search_pages": attempt(
                lambda: f"Search Chunks: {candidate.grounding_metadata.grounding_chunks}"
            ),
            # snake_case like the sibling fields; the camelCase spelling used
            # before raised AttributeError and always yielded "Not used".
            # NOTE(review): assumes google-genai response objects; confirm
            # against call_api.
            "search_support": attempt(
                lambda: f"Search Supports: {candidate.grounding_metadata.grounding_supports}"
            ),
        }

    async def retry_invalid_response(self, program_info, response_key, prompt_template, model_name="gemini-2.5-flash"):
        """
        Ask Gemini again for one answer slot.

        Args:
            program_info (dict): stored record of the program.
            response_key (str): slot being retried (e.g. "response 1");
                used only for logging.
            prompt_template (str): prompt template to format.
            model_name (str): model identifier.

        Returns:
            dict: either an entry with "error" and "retry_status", or the
            new answer text with its metadata and "retry_status".
        """
        async with self.semaphore:
            print(f"      🔄 重新请求 {response_key}...")

            # Missing fields are formatted as empty strings.
            prompt = prompt_template.format(
                university=program_info.get("大学英文名称", ""),
                degree=program_info.get("学位", ""),
                program=program_info.get("专业英文名称", ""),
                department=program_info.get("所属院系（英文）", ""),
                admissions_url=program_info.get("招生网址", ""),
                program_url=program_info.get("专业网址", ""),
            )

            try:
                response = await async_call_gemini(
                    prompt,
                    model_name=model_name,
                    use_search=True,
                    url_context=True
                )

                # Errors reported by the wrapper as an "Error:" string.
                if isinstance(response, str) and response.startswith("Error:"):
                    return {
                        "error": response,
                        "retry_status": "API错误"
                    }

                if not hasattr(response, "candidates") or not response.candidates:
                    return {
                        "error": "No candidates returned",
                        "retry_status": "无候选回答"
                    }

                try:
                    text = response.candidates[0].content.parts[0].text
                except Exception as e:
                    return {
                        "error": f"Cannot parse text: {e}",
                        "retry_status": "解析错误"
                    }

                return {
                    "response_text": text,
                    **self._collect_metadata(response.candidates[0]),
                    "raw_response": str(response),
                    "retry_status": "重试成功"
                }

            except Exception as e:
                return {
                    "error": f"请求失败: {str(e)}",
                    "retry_status": "请求失败"
                }

    async def fix_invalid_curriculum_responses(self, json_file_path, output_json_path, prompt_template):
        """
        Re-query every invalid or failed answer in a records file and save the result.

        Each record's "response 1".."response 3" slots are checked; a slot
        is retried up to ``self.max_retries`` times and replaced only when a
        retry yields a valid answer. Slots that are absent are reported and
        left alone.

        Args:
            json_file_path (str): input JSON file.
            output_json_path (str): where the repaired JSON is written.
            prompt_template (str): prompt template for the re-queries.

        Returns:
            dict: processing statistics, including per-program details.
        """
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        stats = {
            'total_programs': len(data),
            'total_responses': 0,
            'invalid_responses': 0,
            'retry_attempts': 0,
            'successful_fixes': 0,
            'failed_fixes': 0,
            'processing_details': []
        }

        print(f"🔧 开始修复课程网址回答...")
        print(f"📊 总项目数: {len(data)}")
        print("=" * 60)

        for i, record in enumerate(data, 1):
            university = record.get('大学英文名称', '')
            degree = record.get('学位', '')
            program = record.get('专业英文名称', '')

            print(f"\n[{i}/{len(data)}] 检查: {university} - {degree} {program}")

            llm_responses = record.get('llm_reponses', {})
            program_stats = {
                'program_id': i,
                'university': university,
                'program': program,
                'invalid_responses': [],
                'retry_results': []
            }

            # Pass 1: find the slots that need a new answer.
            responses_to_fix = []
            for response_key in ['response 1', 'response 2', 'response 3']:
                if response_key not in llm_responses:
                    print(f"   ⚠️ {response_key}: 缺失")
                    continue

                # Count only slots that actually hold a response.
                stats['total_responses'] += 1

                response_data = llm_responses[response_key]

                # An entry stored as an error is always retried.
                if 'error' in response_data:
                    print(f"   ❌ {response_key}: 已有错误 - {response_data['error']}")
                    responses_to_fix.append(response_key)
                    stats['invalid_responses'] += 1
                    program_stats['invalid_responses'].append({
                        'response_key': response_key,
                        'reason': 'API错误'
                    })
                    continue

                response_text = response_data.get('response_text', '')
                is_valid, reason = self.is_valid_curriculum_url_response(response_text)

                if not is_valid:
                    print(f"   ❌ {response_key}: 无效 - {reason}")
                    responses_to_fix.append(response_key)
                    stats['invalid_responses'] += 1
                    program_stats['invalid_responses'].append({
                        'response_key': response_key,
                        'reason': reason
                    })
                else:
                    print(f"   ✅ {response_key}: 有效")

            # Pass 2: re-query each flagged slot.
            if responses_to_fix:
                print(f"   🔧 需要修复 {len(responses_to_fix)} 个回答")

                for response_key in responses_to_fix:
                    retry_count = 0
                    success = False

                    while retry_count < self.max_retries and not success:
                        retry_count += 1
                        stats['retry_attempts'] += 1

                        print(f"      📝 重试 {response_key} (第 {retry_count}/{self.max_retries} 次)")

                        new_response = await self.retry_invalid_response(
                            record, response_key, prompt_template
                        )

                        if 'error' in new_response:
                            print(f"         ❌ 重试失败: {new_response['error']}")
                            program_stats['retry_results'].append({
                                'response_key': response_key,
                                'attempt': retry_count,
                                'status': 'failed',
                                'reason': new_response['error']
                            })
                        else:
                            new_text = new_response.get('response_text', '')
                            is_valid, reason = self.is_valid_curriculum_url_response(new_text)

                            if is_valid:
                                print(f"         ✅ 重试成功: 获得有效回答")
                                # Replace the stored answer in the loaded data.
                                llm_responses[response_key] = new_response
                                success = True
                                stats['successful_fixes'] += 1
                                program_stats['retry_results'].append({
                                    'response_key': response_key,
                                    'attempt': retry_count,
                                    'status': 'success',
                                    'reason': '获得有效回答'
                                })
                            else:
                                print(f"         ❌ 重试仍无效: {reason}")
                                program_stats['retry_results'].append({
                                    'response_key': response_key,
                                    'attempt': retry_count,
                                    'status': 'invalid',
                                    'reason': reason
                                })

                        # Short random pause before the next attempt.
                        if retry_count < self.max_retries and not success:
                            wait_time = random.uniform(1, 3)
                            await asyncio.sleep(wait_time)

                    if not success:
                        stats['failed_fixes'] += 1
                        print(f"      ❌ {response_key}: 重试全部失败")
            else:
                print(f"   ✅ 所有回答都有效，无需修复")

            stats['processing_details'].append(program_stats)
            print("-" * 40)

        # Write the (possibly updated) records.
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # Summary.
        print(f"\n📊 修复完成！")
        print("=" * 60)
        print(f"总项目数: {stats['total_programs']}")
        print(f"总回答数: {stats['total_responses']}")
        print(f"无效回答数: {stats['invalid_responses']}")
        print(f"重试尝试次数: {stats['retry_attempts']}")
        print(f"成功修复: {stats['successful_fixes']}")
        print(f"修复失败: {stats['failed_fixes']}")

        if stats['invalid_responses'] > 0:
            success_rate = stats['successful_fixes'] / stats['invalid_responses'] * 100
            print(f"修复成功率: {success_rate:.1f}%")

        print(f"\n📄 修复后的JSON已保存至: {output_json_path}")

        return stats

# 使用示例
async def main():
    """Run the repair pass over the stored answers and return its statistics."""
    # Same wording as the notebook-level prompt template, typed here with
    # straight quotes and apostrophes.
    retry_prompt = """
You are an assistant whose only task is to return the **URL of the official page that lists this program's curriculum / required and elective courses**.

────────────────────────────────────────────────────────
How to find the correct URL

1. **Primary *.edu* sources only**  
   • Examine the Admissions and Program URLs provided.  
   • You may open other pages within the same university's *.edu* domain (e.g., "Curriculum", "Course List", "Degree Requirements", "Program Handbook").  
   ⛔  Ignore non-*.edu* sites, PDFs without a stable web address, blogs, or rankings.

2. **Optional Google search**  
   Query once:  
   "{university} {department} {degree} {program} curriculum courses degree requirements site:.edu"  
   Follow *.edu* results until you locate the page that explicitly lists the program's required and elective courses.

3. **Selection rules**  
   • Choose **one** URL (https://…) that best displays the full course list for this program.  
   • If several pages exist, prefer the most specific (program-level) page over school-wide catalogs.  
   • Do **not** perform any summarization or explanation.  
   • If no suitable *.edu* URL is found, output **Not found**.  
   • Never invent a link or return a invalid link

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, no quotes, no extra text)  

Valid examples:  
https://engineering.university.edu/department/ms-data-science/curriculum  
Not found  

────────────────────────────────────────────────────────
Pages to consult first:
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

What is the URL that shows the program's course requirements?
"""

    # input_json_path / output_json_path come from the notebook globals.
    repairer = CurriculumURLValidator(max_retries=1, timeout=15)
    return await repairer.fix_invalid_curriculum_responses(
        input_json_path,
        output_json_path,
        retry_prompt,
    )

# Run the repair pass. Top-level `await` works here because the cell is
# executed by IPython/Jupyter; it is not valid in a plain Python script.
stats = await main()

🔧 开始修复课程网址回答...
📊 总项目数: 454

[1/454] 检查: Princeton University - Ph.D.  Applied and Computational Mathematics
   ❌ response 1: 已有错误 - Error: 400 FAILED_PRECONDITION. {'error': {'code': 400, 'message': 'User location is not supported for the API use.', 'status': 'FAILED_PRECONDITION'}}
   ❌ response 2: 已有错误 - Error: 400 FAILED_PRECONDITION. {'error': {'code': 400, 'message': 'User location is not supported for the API use.', 'status': 'FAILED_PRECONDITION'}}
   ❌ response 3: 已有错误 - Error: 400 FAILED_PRECONDITION. {'error': {'code': 400, 'message': 'User location is not supported for the API use.', 'status': 'FAILED_PRECONDITION'}}
   🔧 需要修复 3 个回答
      📝 重试 response 1 (第 1/1 次)
      🔄 重新请求 response 1...
         ✅ 重试成功: 获得有效回答
      📝 重试 response 2 (第 1/1 次)
      🔄 重新请求 response 2...
         ✅ 重试成功: 获得有效回答
      📝 重试 response 3 (第 1/1 次)
      🔄 重新请求 response 3...
         ✅ 重试成功: 获得有效回答
----------------------------------------

[2/454] 检查: Princeton University - Ph.D.  Mathematics
  

In [7]:
import json
import pandas as pd
import requests
import re
import asyncio
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import sys
import os
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math')

from course_web_content import analyze_curriculum_content_enhanced

def is_valid_url(text):
    """Return True when `text` contains an http(s) URL and is not a bare 'no result' answer."""
    if not text:
        return False
    if text.lower() in ('not found', 'not exist', ''):
        return False

    # Any http(s) URL anywhere in the text is enough.
    return re.search(r'https?://[^\s]+', text) is not None

def clean_url(text):
    """
    Extract the first URL from an answer, or None if there is none.

    Returns None for empty input and for answers whose text (outside any
    URL) contains a refusal/error phrase. Punctuation that merely follows
    the URL in prose or markdown (e.g. a final ".", "**", or an unmatched
    closing bracket) is stripped from the result.
    """
    if not text:
        return None

    url_pattern = r'https?://[^\s]+'

    # Look for refusal phrases only in the text around the URLs, so a link
    # like ".../error-analysis" is not discarded because of its own path.
    invalid_responses = ['not found', 'not exist', 'no curriculum found', 'error']
    surrounding_text = re.sub(url_pattern, ' ', text).lower()
    if any(invalid in surrounding_text for invalid in invalid_responses):
        return None

    closers = {')': '(', ']': '[', '}': '{'}
    for raw in re.findall(url_pattern, text):
        url = raw
        while url:
            trimmed = url.rstrip('.,;:!?"\'*`>')
            # Drop a closing bracket only when it has no opening partner
            # inside the URL (e.g. the ")" ending a markdown link).
            if (trimmed and trimmed[-1] in closers
                    and trimmed.count(trimmed[-1]) > trimmed.count(closers[trimmed[-1]])):
                trimmed = trimmed[:-1]
            if trimmed == url:
                break
            url = trimmed
        if url:
            return url

    return None

async def fetch_url_content(url, timeout=10):
    """
    Fetch a page body, returning None on any failure.

    aiohttp is tried first. If it cannot be imported or the request raises,
    the fetch is repeated with `requests` in a worker thread so the event
    loop is not blocked. A non-200 status from aiohttp returns None without
    trying the fallback.

    Args:
        url (str): URL to fetch.
        timeout: total seconds allowed per request.

    Returns:
        str | None: response text for HTTP 200, otherwise None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        import aiohttp
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=timeout)
        ) as session:
            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    return await response.text()
                return None
    except Exception:
        # Deliberate best-effort: a missing aiohttp or a failed request
        # falls through to the requests fallback. `Exception` (not a bare
        # except) lets task cancellation propagate.
        pass

    def _blocking_get():
        # Synchronous fallback; executed off the event loop below.
        response = requests.get(url, timeout=timeout, headers=headers)
        return response.text if response.status_code == 200 else None

    try:
        return await asyncio.get_running_loop().run_in_executor(None, _blocking_get)
    except Exception:
        return None

def choose_best_curriculum_url(urls_with_content):
    """
    Pick the URL whose page content scores highest as a curriculum page.

    Args:
        urls_with_content: mapping of URL to fetched page content.

    Returns:
        tuple: (url_or_None, reason). The URL is None when the mapping is
        empty or when no page is classified as a curriculum page.
    """
    if not urls_with_content:
        return None, "无有效URL"

    # A single candidate is analysed without the per-URL progress output.
    if len(urls_with_content) == 1:
        only_url, only_content = next(iter(urls_with_content.items()))
        verdict = analyze_curriculum_content_enhanced(only_content, is_url=False)
        if verdict["is_curriculum"]:
            return only_url, f"单一URL验证通过 (置信度: {verdict['confidence_score']})"
        return None, f"单一URL验证失败 (置信度: {verdict['confidence_score']})"

    # Analyse every candidate, logging the verdict as we go.
    reports = {}
    for candidate, page in urls_with_content.items():
        print(f"   🔍 分析URL: {candidate}")
        report = analyze_curriculum_content_enhanced(page, is_url=False)
        reports[candidate] = report

        print(f"      置信度: {report['confidence_score']} ({report.get('confidence_level', 'N/A')})")
        if not report["is_curriculum"]:
            print(f"      ❌ 非课程页面")
            continue
        print(f"      ✅ 识别为课程页面")
        if report.get("course_codes_found"):
            print(f"      📚 课程编号: {', '.join(report['course_codes_found'][:3])}")

    def score_of(entry):
        return entry[1]["confidence_score"]

    accepted = [(u, r) for u, r in reports.items() if r["is_curriculum"]]

    if not accepted:
        # Nothing qualified: report the best score seen, but return no URL.
        _, top_report = max(reports.items(), key=score_of)
        return None, f"无URL达到课程页面标准，最高置信度: {top_report['confidence_score']}"

    best_url, best_report = max(accepted, key=score_of)

    # Assemble the human-readable justification.
    reasons = [f"置信度: {best_report['confidence_score']}"]

    codes = best_report.get("course_codes_found")
    if codes:
        reasons.append(f"发现{len(codes)}个课程编号")

    subjects = best_report.get("subject_areas")
    if subjects:
        reasons.append(f"学科: {', '.join(subjects)}")

    return best_url, f"规则选择完成 ({'; '.join(reasons)})"

def _looks_like_error_page(content):
    """Return True when fetched text reads like an HTTP 404 / "not found" page.

    Matching whole phrases (rather than the bare digits "404") avoids
    rejecting legitimate pages that merely contain course numbers such as
    "MATH 404" or CSS colours such as "#404040".
    """
    lowered = content.lower()
    error_markers = (
        '404 not found',
        '404 - not found',
        '404 page not found',
        'error 404',
        '404 error',
        'http 404',
        'page not found',
    )
    return any(marker in lowered for marker in error_markers)


async def process_curriculum_urls_enhanced(json_file_path):
    """Verbose pipeline that picks one curriculum URL per record of a JSON file.

    For every record the function collects the de-duplicated URLs found in
    the record's ``llm_reponses`` entries ``response 1`` .. ``response 3``,
    fetches each URL, keeps the pages that look valid, and asks
    ``choose_best_curriculum_url`` to pick a winner. Progress is printed
    for every step and a summary is printed at the end.

    Args:
        json_file_path: Path to a UTF-8 JSON file containing a list of
            record dicts.

    Returns:
        A ``pandas.DataFrame`` with one row per record. The same frame is
        written to ``课程网址_规则处理结果.csv`` in the directory of
        ``json_file_path``.

    Raises:
        OSError / json.JSONDecodeError: propagated from opening or parsing
            the input file.
    """

    # Load the input records.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = []

    print("🎓 开始处理课程网址...")
    print("=" * 60)

    for i, record in enumerate(data, 1):
        # Basic identifying information for this program.
        university = record.get('大学英文名称', '')
        degree = record.get('学位', '')
        program = record.get('专业英文名称', '')

        print(f"\n[{i}/{len(data)}] 处理: {university} - {degree} {program}")

        result = {
            '大学英文名称': university,
            '学位': degree,
            '专业英文名称': program,
            '所属院系': record.get('所属院系', ''),
            '招生网址': record.get('招生网址', ''),
            '专业网址': record.get('专业网址', ''),
            '课程网址': '',
            '处理状态': '',
            '有效URL数量': 0,
            '置信度分数': 0,
            '发现课程编号': ''
        }

        # Collect the distinct URLs from the three LLM responses.
        # `or {}` also covers a key that is present but null in the JSON.
        urls = []
        llm_responses = record.get('llm_reponses') or {}

        for j in range(1, 4):
            response_data = llm_responses.get(f'response {j}')
            # Skip missing or malformed (non-dict) response entries.
            if not isinstance(response_data, dict):
                continue
            if 'response_text' in response_data:
                url = clean_url(response_data['response_text'])
                if url and url not in urls:
                    urls.append(url)

        print(f"📋 发现URL: {len(urls)} 个")
        for j, url in enumerate(urls, 1):
            print(f"   {j}. {url}")

        if not urls:
            result['处理状态'] = '无有效URL'
            results.append(result)
            print("❌ 无有效URL")
            continue

        # Fetch the content behind every candidate URL.
        urls_with_content = {}
        print("🌐 开始获取URL内容...")

        for url in urls:
            print(f"   📥 获取: {url}")
            content = await fetch_url_content(url)
            if content:
                # Cheap sanity check: long enough and not an error page.
                if len(content) > 500 and not _looks_like_error_page(content):
                    urls_with_content[url] = content
                    print(f"      ✅ 内容获取成功 ({len(content)} 字符)")
                else:
                    print("      ❌ 内容无效或过短")
            else:
                print("      ❌ 获取失败")

        result['有效URL数量'] = len(urls_with_content)

        if not urls_with_content:
            result['处理状态'] = '所有URL内容获取失败'
            results.append(result)
            print("❌ 所有URL内容获取失败")
            continue

        # Rule-based selection of the best candidate.
        print(f"🤖 使用规则分析 {len(urls_with_content)} 个有效URL...")
        best_url, selection_reason = choose_best_curriculum_url(urls_with_content)

        if best_url:
            result['课程网址'] = best_url
            result['处理状态'] = selection_reason

            # The chooser only returns the URL and a reason string, so the
            # winner is analysed once more to record its score and codes.
            analysis = analyze_curriculum_content_enhanced(urls_with_content[best_url], is_url=False)
            result['置信度分数'] = analysis['confidence_score']
            if analysis.get('course_codes_found'):
                result['发现课程编号'] = '; '.join(analysis['course_codes_found'][:5])

            print(f"✅ 选中URL: {best_url}")
            print(f"   {selection_reason}")
        else:
            result['课程网址'] = ''
            result['处理状态'] = selection_reason
            print("❌ 未找到合适的课程URL")
            print(f"   {selection_reason}")

        results.append(result)
        print("-" * 40)

    # Persist the results next to the input file.
    output_dir = os.path.dirname(json_file_path)
    output_filename = '课程网址_规则处理结果.csv'
    output_path = os.path.join(output_dir, output_filename)

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

    # Summary statistics; the guards keep an empty input from dividing by zero.
    total_processed = len(results)
    found_curriculum = sum(1 for r in results if r['课程网址'])
    avg_confidence = sum(r['置信度分数'] for r in results if r['置信度分数']) / max(found_curriculum, 1)
    success_rate = found_curriculum / total_processed * 100 if total_processed else 0.0

    print("\n📊 处理完成！")
    print("=" * 60)
    print(f"总计处理记录: {total_processed}")
    print(f"成功找到课程网址: {found_curriculum} ({success_rate:.1f}%)")
    print(f"平均置信度分数: {avg_confidence:.1f}")
    print(f"结果保存至: {output_path}")

    # Tally records by status, using only the text before any "(" detail.
    status_counts = {}
    for result in results:
        status = result['处理状态'].split('(')[0].strip()
        status_counts[status] = status_counts.get(status, 0) + 1

    print("\n📈 处理状态统计:")
    for status, count in sorted(status_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"   {status}: {count} 条")

    return df

# Fast processing variant (no per-record console output)
async def process_curriculum_urls_silent(json_file_path):
    """Silent counterpart of the verbose pipeline, meant for large batches.

    Reads the list of records from ``json_file_path``, picks at most one
    curriculum URL per record, writes the rows to
    ``课程网址_规则处理结果_fixed.csv`` in the same directory as the input
    file, and returns them as a ``pandas.DataFrame``.
    """
    with open(json_file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    rows = []

    for entry in records:
        row = {
            '大学英文名称': entry.get('大学英文名称', ''),
            '学位': entry.get('学位', ''),
            '专业英文名称': entry.get('专业英文名称', ''),
            '所属院系': entry.get('所属院系', ''),
            '招生网址': entry.get('招生网址', ''),
            '专业网址': entry.get('专业网址', ''),
            '课程网址': '',
            '处理状态': '',
            '置信度分数': 0
        }
        # Registered up front; the dict is filled in place below, so every
        # exit path of this iteration leaves exactly one row behind.
        rows.append(row)

        # Gather the distinct cleaned URLs from the three LLM responses.
        responses = entry.get('llm_reponses', {})
        candidates = []
        for key in (f'response {n}' for n in range(1, 4)):
            if key not in responses:
                continue
            payload = responses[key]
            if 'response_text' not in payload:
                continue
            cleaned = clean_url(payload['response_text'])
            if cleaned and cleaned not in candidates:
                candidates.append(cleaned)

        if not candidates:
            row['处理状态'] = '无有效URL'
            continue

        # Download each candidate and keep only bodies longer than 500 chars.
        fetched = {}
        for candidate in candidates:
            body = await fetch_url_content(candidate)
            if not body or len(body) <= 500:
                continue
            fetched[candidate] = body

        if not fetched:
            row['处理状态'] = 'URL内容获取失败'
            continue

        winner, _reason = choose_best_curriculum_url(fetched)
        if not winner:
            row['处理状态'] = '未找到合适URL'
            continue

        row['课程网址'] = winner
        row['处理状态'] = '规则选择成功'
        # Analyse the winning page again to record its confidence score.
        winner_analysis = analyze_curriculum_content_enhanced(fetched[winner], is_url=False)
        row['置信度分数'] = winner_analysis['confidence_score']

    # Write the CSV alongside the input JSON.
    output_path = os.path.join(
        os.path.dirname(json_file_path),
        '课程网址_规则处理结果_fixed.csv',
    )

    df = pd.DataFrame(rows)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

    return df

# Usage example
async def main(json_file_path=None, silent=False):
    """Run the curriculum-URL pipeline and print a preview of the first rows.

    Args:
        json_file_path: Path to the JSON records file. When omitted, the
            original hard-coded Gemini output file is used, so a bare
            ``main()`` call behaves as before.
        silent: When True, use ``process_curriculum_urls_silent`` (no
            per-record output); otherwise use the verbose
            ``process_curriculum_urls_enhanced``.

    Returns:
        None. The preview is printed.
    """
    if json_file_path is None:
        json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math/fields_records/课程网址/课程网址_gemini-2.5-flash_0_454_fixed.json"

    if silent:
        df = await process_curriculum_urls_silent(json_file_path)
    else:
        df = await process_curriculum_urls_enhanced(json_file_path)

    preview_columns = ['大学英文名称', '专业英文名称', '课程网址', '处理状态', '置信度分数']

    print(f"\n前5条结果预览:")
    # reindex (instead of df[[...]]) tolerates an empty result frame that
    # has no columns; with all columns present the output is the same.
    print(df.reindex(columns=preview_columns).head())

# Run the pipeline. Top-level `await` is a notebook/IPython convenience;
# in a plain Python script this would have to be asyncio.run(main()).
await main()

🎓 开始处理课程网址...

[1/454] 处理: Princeton University - Ph.D.  Applied and Computational Mathematics
📋 发现URL: 1 个
   1. https://gradschool.princeton.edu/academics/degrees-requirements/fields-study/applied-and-computational-math
🌐 开始获取URL内容...
   📥 获取: https://gradschool.princeton.edu/academics/degrees-requirements/fields-study/applied-and-computational-math
      ❌ 获取失败
❌ 所有URL内容获取失败

[2/454] 处理: Princeton University - Ph.D.  Mathematics
📋 发现URL: 1 个
   1. https://gradschool.princeton.edu/academics/degrees-requirements/fields-study/mathematics
🌐 开始获取URL内容...
   📥 获取: https://gradschool.princeton.edu/academics/degrees-requirements/fields-study/mathematics
      ❌ 获取失败
❌ 所有URL内容获取失败

[3/454] 处理: Massachusetts Institute of Technology - PhD Mathematics
📋 发现URL: 2 个
   1. https://catalog.mit.edu/graduate/departments/mathematics/doctorofphilosophy/
   2. https://catalog.mit.edu/graduate/schools/science/mathematics/doctor-philosophy-mathematics/
🌐 开始获取URL内容...
   📥 获取: https://catalog.mit.edu/gradu