In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "Capstone或Thesis"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 7)

In [2]:
prompt_template = """
You are an assistant whose single task is to identify **all graduation requirements that go beyond regular coursework** for the graduate program below.

These non-coursework requirements may include, but are not limited to:
• Capstone project • Research thesis • Comprehensive/qualifying exam  
• Internship / practicum hours • Mandatory fieldwork or research‐lab hours  
• Professional portfolio • Residency / clinical experience

────────────────────────────────────────────────────────
🔍  How to locate the information

1. **Use primary university sources only**  
   • Follow the Admissions and Program URLs given.  
   • You may open additional pages that live under the same university’s *.edu* domain (e.g., “Degree Requirements”, “Program Handbook”, “Curriculum”).  
   ⛔  Ignore every non-*.edu* website, news article, or secondary aggregate.

2. **Search guideline (if needed)**  
   Run a single Google search:  
   "{university} {degree} {program} graduation requirement capstone thesis practicum exam site:.edu"  
   Check only *.edu* results until you find an authoritative statement of requirements.

3. **What to output**  
   • Summarize the requirement(s) in **one concise sentence**—no quotes, no explanations.  
   • If multiple options exist, describe them briefly (e.g., “Capstone or thesis option”).  
   • If the program states there are *no* additional requirements, say so (e.g., “No additional graduation requirement”).  
   • If you do **not** find a clear statement from an *.edu* source, output **Not found**—never guess or invent.

────────────────────────────────────────────────────────
⚠️  **Output format (one line, no quotes, no extra text)**  
Valid examples:

Capstone project required  
Research thesis mandatory  
Capstone or thesis option available  
Industry internship of at least 400 hours required  
Comprehensive exam required  
No additional graduation requirement  
Not found  

────────────────────────────────────────────────────────
Pages to consult first:
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

What non-coursework graduation requirement applies to this program?
"""


In [3]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(2)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    1. Format the prompt for this row
    2. Launch `num_vote` Gemini calls in parallel
    3. Capture BOTH normal answers *and* every possible error case
    4. Return a serialisable record
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            # -- 3. Extract main answer text
            try:
                text = response.candidates[0].content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{response.candidates[0].grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{response.candidates[0].grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                search_support = (
                    f"Search Supports: "
                    f"{response.candidates[0].grounding_metadata.groundingSupports}"
                )
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Runs `process_row` over the dataframe slice asynchronously,
    shows a live tqdm bar, and dumps the results to JSON.
    """
    df = field_df.copy()[start_from:end_at]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    with open(output_path, "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [4]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = -1
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 350/350 [48:56<00:00,  8.39s/it] 


In [3]:
import json
import os

def extract_responses_from_json(json_file_path, output_dir):
    """
    从JSON文件中提取所有response_text并打印，同时保存到txt文件
    
    Args:
        json_file_path (str): JSON文件路径
        output_dir (str): 输出目录路径
    """
    
    # 读取JSON文件
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # 创建输出目录
    os.makedirs(output_dir, exist_ok=True)
    
    # 获取文件名（不含扩展名）
    file_name = os.path.basename(json_file_path).replace('.json', '')
    output_txt_path = os.path.join(output_dir, f"{file_name}_responses.txt")
    
    all_responses = []
    response_count = 0
    
    print(f"正在处理文件: {json_file_path}")
    print(f"总项目数: {len(data)}")
    print("\n" + "="*80)
    
    # 遍历所有项目
    for i, project in enumerate(data):
        university = project.get('大学英文名称', 'Unknown University')
        degree = project.get('学位', 'Unknown Degree')
        major = project.get('专业英文名称', 'Unknown Major')
        
        project_header = f"\n项目 {i+1}: {university} - {degree} - {major}"
        print(project_header)
        all_responses.append(project_header)
        
        # 获取LLM回答
        llm_responses = project.get('llm_reponses', {})
        
        # 遍历每个回答
        for resp_key in ['response 1', 'response 2', 'response 3']:
            if resp_key in llm_responses:
                resp_data = llm_responses[resp_key]
                response_text = resp_data.get('response_text', 'No response text')
                
                response_line = f"  {resp_key}: {response_text}"
                print(response_line)
                all_responses.append(response_line)
                response_count += 1
            else:
                missing_line = f"  {resp_key}: [缺失回答]"
                print(missing_line)
                all_responses.append(missing_line)
        
        print("-" * 60)
        all_responses.append("-" * 60)
    
    # 保存到txt文件
    with open(output_txt_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_responses))
    
    # 打印统计信息
    print(f"\n统计信息:")
    print(f"总项目数: {len(data)}")
    print(f"总回答数: {response_count}")
    print(f"平均每项目回答数: {response_count/len(data):.2f}")
    print(f"\n所有回答已保存到: {output_txt_path}")
    
    return output_txt_path

# 使用示例
json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis_gemini-2.5-flash_0_-1.json"
output_dir = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis"

# 执行函数
output_file = extract_responses_from_json(json_file_path, output_dir)

正在处理文件: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis_gemini-2.5-flash_0_-1.json
总项目数: 350


项目 1: Harvard University - MS - Data Science SEAS
  response 1: A Capstone research project or a master's thesis is required, along with a project poster presentation.
  response 2: A capstone project or a master's thesis is required.
  response 3: A research experience, fulfilled by either a capstone project or a master's thesis, and a data science project poster presentation are required.
------------------------------------------------------------

项目 2: Harvard University - MS - Data Science
  response 1: Capstone project required.
  response 2: Capstone project required.
  response 3: Capstone project required.
------------------------------------------------------------

项目 3: Stanford University - MS - Data Science
  response 1: A capstone project or practical component is required.
  response 2: No addit

In [7]:
import json
import pandas as pd
import asyncio
from tqdm.asyncio import tqdm_asyncio
import sys
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')

from call_api import async_call_gemini

async def extract_final_graduation_requirement_with_gemini(json_file_path, output_csv_path, 
                                                         model_name="gemini-2.5-flash", 
                                                         semaphore_limit=2):
    """
    使用Gemini API从三个回答中提取最终的毕业要求
    
    Args:
        json_file_path (str): JSON文件路径
        output_csv_path (str): 输出CSV文件路径
        model_name (str): Gemini模型名称
        semaphore_limit (int): 并发限制
    
    Returns:
        pd.DataFrame: 包含提取结果的DataFrame
    """
    
    # 限制并发数
    semaphore = asyncio.Semaphore(semaphore_limit)
    
    # 构建prompt模板
    prompt_template = """
You are analyzing graduation requirements for a graduate program. You have been given 3 different responses about what graduation requirements exist beyond regular coursework for this program.

Here are the 3 responses:

Response 1: "{response1}"
Response 2: "{response2}"  
Response 3: "{response3}"

TASK:
Based on these 3 responses, determine what the actual graduation requirement is using majority voting principles. If 2 or more responses say the same thing, that should be your answer. If all responses are different, choose the most specific and detailed one.

INSTRUCTIONS:
1. Read all 3 responses carefully
2. Identify what graduation requirement (beyond coursework) each response mentions
3. Apply majority voting - if 2+ responses agree, use that answer
4. If responses conflict, choose the most detailed/specific one
5. Give your final answer as a clear, concise statement
6. If the resposnes are Not found, that means there is no additional graduation requirement.

OUTPUT FORMAT:
Provide your answer as a single clear sentence describing the graduation requirement. Examples:
- "Capstone project required"
- "Research thesis required" 
- "Comprehensive exam required"
- "No additional graduation requirement"
- "Internship required"
- "Thesis or capstone project option available"

Do not explain your reasoning or provide multiple sentences. Just give the final requirement in one clear sentence.

FINAL ANSWER:
"""

    async def process_single_project(project):
        """处理单个项目"""
        async with semaphore:
            # 提取回答
            llm_responses = project.get('llm_reponses', {})
            
            response1 = ""
            response2 = ""
            response3 = ""
            
            for i in range(1, 4):
                response_key = f"response {i}"
                if response_key in llm_responses:
                    response_text = llm_responses[response_key].get('response_text', '')
                    if i == 1:
                        response1 = response_text
                    elif i == 2:
                        response2 = response_text
                    elif i == 3:
                        response3 = response_text
            
            # 构建prompt
            prompt = prompt_template.format(
                response1=response1 or "No response provided",
                response2=response2 or "No response provided", 
                response3=response3 or "No response provided"
            )
            
            # 调用Gemini API
            try:
                response = await async_call_gemini(
                    prompt,
                    model_name=model_name,
                    use_search=False,
                    url_context=False
                )
                
                # 处理API响应
                if isinstance(response, str) and response.startswith("Error:"):
                    return {
                        'final_requirement': f'API_ERROR: {response}',
                        'raw_api_response': response,
                        'status': 'API_ERROR'
                    }
                
                if not hasattr(response, "candidates") or not response.candidates:
                    return {
                        'final_requirement': 'API_ERROR: No candidates returned',
                        'raw_api_response': 'No candidates returned',
                        'status': 'API_ERROR'
                    }
                
                # 提取回答文本
                try:
                    api_response_text = response.candidates[0].content.parts[0].text.strip()
                except Exception as e:
                    return {
                        'final_requirement': f'PARSE_ERROR: {e}',
                        'raw_api_response': str(response),
                        'status': 'PARSE_ERROR'
                    }
                
                # 提取最终答案（寻找"FINAL ANSWER:"后的内容）
                final_answer = api_response_text
                if "FINAL ANSWER:" in api_response_text:
                    final_answer = api_response_text.split("FINAL ANSWER:")[-1].strip()
                
                # 清理答案（移除多余的格式）
                final_answer = final_answer.replace('\n', ' ').strip()
                if final_answer.startswith('-'):
                    final_answer = final_answer[1:].strip()
                if final_answer.startswith('"') and final_answer.endswith('"'):
                    final_answer = final_answer[1:-1].strip()
                
                return {
                    'final_requirement': final_answer,
                    'raw_api_response': api_response_text,
                    'status': 'SUCCESS'
                }
                
            except Exception as e:
                return {
                    'final_requirement': f'EXCEPTION_ERROR: {e}',
                    'raw_api_response': '',
                    'status': 'EXCEPTION_ERROR'
                }
    
    # 读取JSON文件
    print(f"读取文件: {json_file_path}")
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"总项目数: {len(data)}")
    
    # 并行处理所有项目
    print("开始调用Gemini API提取最终毕业要求...")
    
    tasks = [process_single_project(project) for project in data]
    api_results = await tqdm_asyncio.gather(*tasks)
    
    # 整理结果
    results = []
    status_stats = {}
    
    for i, (project, api_result) in enumerate(zip(data, api_results)):
        # 提取基本信息
        university = project.get('大学英文名称', '')
        degree = project.get('学位', '')
        major = project.get('专业英文名称', '')
        school = project.get('所属院系', '')
        
        # 提取原始回答
        llm_responses = project.get('llm_reponses', {})
        original_responses = {}
        for j in range(1, 4):
            response_key = f"response {j}"
            if response_key in llm_responses:
                original_responses[f'原始回答{j}'] = llm_responses[response_key].get('response_text', '')
            else:
                original_responses[f'原始回答{j}'] = 'No response provided'
        
        # 统计状态
        status = api_result['status']
        if status not in status_stats:
            status_stats[status] = 0
        status_stats[status] += 1
        
        # 创建结果记录
        result_record = {
            '序号': i + 1,
            '大学英文名称': university,
            '学位': degree,
            '专业英文名称': major,
            '所属院系': school,
            'Gemini最终毕业要求': api_result['final_requirement'],
            '处理状态': status,
            'Gemini完整回答': api_result['raw_api_response']
        }
        
        # 添加原始回答
        result_record.update(original_responses)
        
        results.append(result_record)
    
    # 创建DataFrame并保存
    df = pd.DataFrame(results)
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    
    # 打印统计信息
    print(f"\nGemini处理结果统计:")
    print(f"总项目数: {len(data)}")
    
    for status, count in status_stats.items():
        percentage = count / len(data) * 100
        print(f"{status}: {count} 个项目 ({percentage:.1f}%)")
    
    # 分析成功提取的要求类型
    successful_results = df[df['处理状态'] == 'SUCCESS']
    if len(successful_results) > 0:
        print(f"\n成功提取的毕业要求示例 (前10个):")
        for i, req in enumerate(successful_results['Gemini最终毕业要求'].head(10)):
            print(f"{i+1}. {req}")
    
    print(f"\n结果已保存到: {output_csv_path}")
    
    return df, status_stats

# 使用示例
async def main():
    """主函数"""
    json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis_gemini-2.5-flash_0_-1.json"
    output_csv_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis.csv"
    
    df, stats = await extract_final_graduation_requirement_with_gemini(
        json_file_path=json_file_path,
        output_csv_path=output_csv_path,
        model_name="gemini-2.5-flash",
        semaphore_limit=2
    )
    
    return df, stats

# 在Jupyter中运行
if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    
    # 运行函数
    df, stats = asyncio.run(main())

读取文件: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis_gemini-2.5-flash_0_-1.json
总项目数: 350
开始调用Gemini API提取最终毕业要求...


100%|██████████| 350/350 [12:48<00:00,  2.20s/it]


Gemini处理结果统计:
总项目数: 350
SUCCESS: 314 个项目 (89.7%)
PARSE_ERROR: 29 个项目 (8.3%)
API_ERROR: 7 个项目 (2.0%)

成功提取的毕业要求示例 (前10个):
1. A capstone project or a master's thesis, and a project poster presentation are required.
2. Capstone project required.
3. A capstone project is required as a practical component.
4. No additional graduation requirement
5. Qualifying evaluation, candidacy examination, and dissertation defense are required.
6. Practicum or thesis with a report and presentation required.
7. Students can choose from a final exam, a project with a written report and oral presentation, or a thesis with a written thesis and oral defense.
8. Industry internship and ECE Master's Success Weekly Seminar are required.
9. Capstone project required.
10. An industry internship and a master of engineering project with assessment are required.

结果已保存到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis.csv





In [9]:
import pandas as pd
import asyncio
from tqdm.asyncio import tqdm_asyncio
from call_api import async_call_gemini

async def fix_gemini_errors(csv_file_path, output_csv_path, 
                           model_name="gemini-2.5-flash", 
                           max_retries=2,
                           semaphore_limit=2):
    """
    修复Gemini API调用中的错误回答
    
    Args:
        csv_file_path (str): 上一步生成的CSV文件路径
        output_csv_path (str): 修复后的输出CSV文件路径
        model_name (str): Gemini模型名称
        max_retries (int): 最大重试次数
        semaphore_limit (int): 并发限制
    
    Returns:
        pd.DataFrame: 修复后的DataFrame
    """
    
    # 限制并发数
    semaphore = asyncio.Semaphore(semaphore_limit)
    
    # 使用相同的prompt模板
    prompt_template = """
You are analyzing graduation requirements for a graduate program. You have been given 3 different responses about what graduation requirements exist beyond regular coursework for this program.

Here are the 3 responses:

Response 1: "{response1}"
Response 2: "{response2}"  
Response 3: "{response3}"

TASK:
Based on these 3 responses, determine what the actual graduation requirement is using majority voting principles. If 2 or more responses say the same thing, that should be your answer. If all responses are different, choose the most specific and detailed one.

INSTRUCTIONS:
1. Read all 3 responses carefully
2. Identify what graduation requirement (beyond coursework) each response mentions
3. Apply majority voting - if 2+ responses agree, use that answer
4. If responses conflict, choose the most detailed/specific one
5. Give your final answer as a clear, concise statement

OUTPUT FORMAT:
Provide your answer as a single clear sentence describing the graduation requirement. Examples:
- "Capstone project required"
- "Research thesis required" 
- "Comprehensive exam required"
- "No additional graduation requirement"
- "Internship required"
- "Thesis or capstone project option available"

Do not explain your reasoning or provide multiple sentences. Just give the final requirement in one clear sentence.

FINAL ANSWER:
"""

    async def retry_single_project(row, attempt_num):
        """重试单个项目的API调用"""
        async with semaphore:
            # 构建prompt
            prompt = prompt_template.format(
                response1=row.get('原始回答1', 'No response provided'),
                response2=row.get('原始回答2', 'No response provided'),
                response3=row.get('原始回答3', 'No response provided')
            )
            
            # 调用Gemini API
            try:
                response = await async_call_gemini(
                    prompt,
                    model_name=model_name,
                    use_search=False,
                    url_context=False
                )
                
                # 处理API响应
                if isinstance(response, str) and response.startswith("Error:"):
                    return {
                        'final_requirement': f'API_ERROR_RETRY_{attempt_num}: {response}',
                        'raw_api_response': response,
                        'status': 'API_ERROR',
                        'retry_attempt': attempt_num
                    }
                
                if not hasattr(response, "candidates") or not response.candidates:
                    return {
                        'final_requirement': f'API_ERROR_RETRY_{attempt_num}: No candidates returned',
                        'raw_api_response': 'No candidates returned',
                        'status': 'API_ERROR',
                        'retry_attempt': attempt_num
                    }
                
                # 提取回答文本
                try:
                    api_response_text = response.candidates[0].content.parts[0].text.strip()
                except Exception as e:
                    return {
                        'final_requirement': f'PARSE_ERROR_RETRY_{attempt_num}: {e}',
                        'raw_api_response': str(response),
                        'status': 'PARSE_ERROR',
                        'retry_attempt': attempt_num
                    }
                
                # 提取最终答案
                final_answer = api_response_text
                if "FINAL ANSWER:" in api_response_text:
                    final_answer = api_response_text.split("FINAL ANSWER:")[-1].strip()
                
                # 清理答案
                final_answer = final_answer.replace('\n', ' ').strip()
                if final_answer.startswith('-'):
                    final_answer = final_answer[1:].strip()
                if final_answer.startswith('"') and final_answer.endswith('"'):
                    final_answer = final_answer[1:-1].strip()
                
                # 验证答案是否合法（不为空且不包含错误标识）
                if (final_answer and 
                    len(final_answer.strip()) > 0 and 
                    not final_answer.upper().startswith(('API_ERROR', 'PARSE_ERROR', 'EXCEPTION_ERROR'))):
                    return {
                        'final_requirement': final_answer,
                        'raw_api_response': api_response_text,
                        'status': 'SUCCESS',
                        'retry_attempt': attempt_num
                    }
                else:
                    return {
                        'final_requirement': f'INVALID_RESPONSE_RETRY_{attempt_num}: {final_answer}',
                        'raw_api_response': api_response_text,
                        'status': 'PARSE_ERROR',
                        'retry_attempt': attempt_num
                    }
                
            except Exception as e:
                return {
                    'final_requirement': f'EXCEPTION_ERROR_RETRY_{attempt_num}: {e}',
                    'raw_api_response': '',
                    'status': 'EXCEPTION_ERROR',
                    'retry_attempt': attempt_num
                }
    
    # 读取CSV文件
    print(f"读取CSV文件: {csv_file_path}")
    df = pd.read_csv(csv_file_path, encoding='utf-8-sig')
    
    print(f"总项目数: {len(df)}")
    
    # 找出需要修复的行
    error_conditions = (
        (df['处理状态'] == 'PARSE_ERROR') | 
        (df['处理状态'] == 'API_ERROR') |
        (df['处理状态'] == 'EXCEPTION_ERROR') |
        (df['Gemini最终毕业要求'].str.contains('ERROR', case=False, na=False))
    )
    
    error_rows = df[error_conditions].copy()
    success_rows = df[~error_conditions].copy()
    
    print(f"需要修复的项目数: {len(error_rows)}")
    print(f"已成功的项目数: {len(success_rows)}")
    
    if len(error_rows) == 0:
        print("没有需要修复的项目!")
        return df
    
    # 对错误行进行重试
    fixed_rows = []
    
    for retry_attempt in range(1, max_retries + 1):
        if len(error_rows) == 0:
            break
            
        print(f"\n开始第 {retry_attempt} 次重试，处理 {len(error_rows)} 个项目...")
        
        # 并行重试
        tasks = [retry_single_project(row, retry_attempt) for _, row in error_rows.iterrows()]
        retry_results = await tqdm_asyncio.gather(*tasks)
        
        # 处理重试结果
        new_fixed_rows = []
        remaining_error_rows = []
        
        for (idx, row), retry_result in zip(error_rows.iterrows(), retry_results):
            # 更新行数据
            updated_row = row.copy()
            updated_row['Gemini最终毕业要求'] = retry_result['final_requirement']
            updated_row['处理状态'] = retry_result['status']
            updated_row['Gemini完整回答'] = retry_result['raw_api_response']
            updated_row[f'重试第{retry_attempt}次'] = f"状态: {retry_result['status']}, 结果: {retry_result['final_requirement'][:50]}..."
            
            if retry_result['status'] == 'SUCCESS':
                new_fixed_rows.append(updated_row)
                print(f"✓ 项目 {updated_row['序号']} 修复成功: {retry_result['final_requirement'][:50]}...")
            else:
                remaining_error_rows.append(updated_row)
        
        # 更新列表
        fixed_rows.extend(new_fixed_rows)
        error_rows = pd.DataFrame(remaining_error_rows) if remaining_error_rows else pd.DataFrame()
        
        print(f"第 {retry_attempt} 次重试结果:")
        print(f"  成功修复: {len(new_fixed_rows)} 个项目")
        print(f"  仍有错误: {len(remaining_error_rows)} 个项目")
    
    # 合并所有结果
    all_results = []
    
    # 添加原本成功的行
    for _, row in success_rows.iterrows():
        all_results.append(row)
    
    # 添加修复成功的行
    for row in fixed_rows:
        all_results.append(row)
    
    # 添加最终仍失败的行
    if len(error_rows) > 0:
        for _, row in error_rows.iterrows():
            all_results.append(row)
    
    # 创建最终DataFrame
    final_df = pd.DataFrame(all_results)
    final_df = final_df.sort_values('序号').reset_index(drop=True)
    
    # 保存结果
    final_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    
    # 打印最终统计
    print(f"\n=== 最终修复结果统计 ===")
    print(f"总项目数: {len(final_df)}")
    
    final_status_counts = final_df['处理状态'].value_counts()
    for status, count in final_status_counts.items():
        percentage = count / len(final_df) * 100
        print(f"{status}: {count} 个项目 ({percentage:.1f}%)")
    
    success_count = final_status_counts.get('SUCCESS', 0)
    improvement = success_count - len(success_rows)
    print(f"\n修复效果:")
    print(f"修复前成功项目: {len(success_rows)}")
    print(f"修复后成功项目: {success_count}")
    print(f"新增成功项目: {improvement}")
    print(f"最终成功率: {success_count/len(final_df)*100:.1f}%")
    
    print(f"\n修复后的结果已保存到: {output_csv_path}")
    
    return final_df

# 使用示例
async def main():
    """主函数"""
    csv_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis.csv"
    output_csv_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis_修复版.csv"
    
    df = await fix_gemini_errors(
        csv_file_path=csv_file_path,
        output_csv_path=output_csv_path,
        model_name="gemini-2.5-flash",
        max_retries=2,
        semaphore_limit=2
    )
    
    return df

# 在Jupyter中运行
if __name__ == "__main__":
    import nest_asyncio
    nest_asyncio.apply()
    
    # 运行函数
    df = asyncio.run(main())

读取CSV文件: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis.csv
总项目数: 350
需要修复的项目数: 36
已成功的项目数: 314

开始第 1 次重试，处理 36 个项目...


100%|██████████| 36/36 [02:27<00:00,  4.10s/it]


✓ 项目 11 修复成功: A Master's thesis, Master's portfolio, or Master's...
✓ 项目 20 修复成功: A research dissertation is required....
✓ 项目 42 修复成功: Comprehensive examination and research dissertatio...
✓ 项目 46 修复成功: Research thesis required....
✓ 项目 48 修复成功: No additional graduation requirement....
✓ 项目 50 修复成功: No additional graduation requirement....
✓ 项目 71 修复成功: Two three-month research rotations, a qualifying e...
✓ 项目 79 修复成功: A preliminary assessment examination, a research q...
✓ 项目 83 修复成功: A summer research paper, a first-year screening ex...
✓ 项目 109 修复成功: No additional graduation requirement....
✓ 项目 110 修复成功: Research thesis option available....
✓ 项目 124 修复成功: Comprehensive exams, an applied data analysis proj...
✓ 项目 200 修复成功: No additional graduation requirement....
✓ 项目 211 修复成功: Research thesis with oral defense or a research pr...
✓ 项目 212 修复成功: Comprehensive qualifying examination, original res...
✓ 项目 213 修复成功: Capstone project required....
✓ 项目 221 修复成功: Capstone project, mast

100%|██████████| 13/13 [01:06<00:00,  5.15s/it]

✓ 项目 6 修复成功: Qualifying exams, practical work resulting in a pr...
✓ 项目 58 修复成功: A Capstone Research Project is required....
✓ 项目 69 修复成功: Master's thesis and oral examination required for ...
✓ 项目 129 修复成功: No additional graduation requirement....
✓ 项目 135 修复成功: Attendance at 12 colloquia and one year of relevan...
✓ 项目 154 修复成功: A thesis or special problem and statistical consul...
✓ 项目 224 修复成功: A 3-credit capstone project and a final examinatio...
✓ 项目 240 修复成功: Comprehensive exam required...
✓ 项目 283 修复成功: Capstone project with professional placement requi...
第 2 次重试结果:
  成功修复: 9 个项目
  仍有错误: 4 个项目

=== 最终修复结果统计 ===
总项目数: 350
SUCCESS: 346 个项目 (98.9%)
PARSE_ERROR: 4 个项目 (1.1%)

修复效果:
修复前成功项目: 314
修复后成功项目: 346
新增成功项目: 32
最终成功率: 98.9%

修复后的结果已保存到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/Capstone或Thesis/Capstone或Thesis_修复版.csv



