In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "推荐信"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 7)

In [2]:
prompt_template = """
You are an assistant that checks how many letters of recommendation are required for the following graduate program.

1. Search the admissions and program webpages provided.  
2. Use Google search to find more information.

Answer instructions:
Give your response in a short sentence. In most cases, the number of letters of recommendation is 2 or 3, so you should return "2 letters of recommendation" or "3 letters of recommendation", with nothing else.
In rare cases, the program does not explicitly mention the number of letters of recommendation, you should return "No explicit demands on letters of recommendation."

Example response:
"3 letters of recommendation."
"2~3 letters of recommendation."
"3~4 letter of recommendation."
"Not mentioned." (rare cases)

Use Google to search **"{university} {degree} {program} {department} number of letters of recommendation"** for more information.

URLs you should check:
• Admissions URL: {admissions_url}  
• Program URL: {program_url}

Here are your response:
"""

In [3]:
semaphore = asyncio.Semaphore(5) 

async def process_row(row, prompt_template, num_vote, model_name):
    async with semaphore:
        row = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )
        record = row.copy()
        record["llm_reponses"] = {}

        # Launch all API calls in parallel for this row
        tasks = [
            async_call_gemini(prompt, model_name=model_name, use_search=True, url_context=True)
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        for i, response in enumerate(responses):
            try:
                text = response.candidates[0].content.parts[0].text
            except:
                text = ''
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except:
                url_context = "Not used"
            try:
                search_pages = f"Search Chunks: {response.candidates[0].grounding_metadata.grounding_chunks}"
            except:
                search_pages = "Not used"
            try:
                search_queries = f"Search Query: {response.candidates[0].grounding_metadata.web_search_queries}"
            except:
                search_queries = "Not used"
            try:
                search_support = f"Search Query: {response.candidates[0].grounding_metadata.groundingSupports}"
            except:
                search_support = "Not used"

            record["llm_reponses"][f"response {i+1}"] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
            }
        return record

async def request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=0, end_at=-1):
    df = field_df.copy()[start_from:end_at]
    response_records = []

    # Create tasks for all rows
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]
    # Run all row tasks in parallel (limit concurrency if needed)
    response_records = await tqdm_asyncio.gather(*tasks)

    # Save results
    with open(f"../fields_records/{field_name}/{field_name}_{model_name}_{start_from}_{end_at}.json", "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)
    return response_records

In [5]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = -1
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 350/350 [15:13<00:00,  2.61s/it]


In [3]:
import json
import pandas as pd
import re
import os
from collections import Counter

def extract_recommendation_letters_info(json_file_path):
    """
    从JSON文件中提取推荐信数量信息，使用majority vote判断
    
    Args:
        json_file_path: JSON文件路径
    
    Returns:
        DataFrame: 处理后的数据
    """
    
    # 读取JSON文件
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    results = []
    
    for record in data:
        # 提取基本信息
        basic_info = {
            '大学英文名称': record.get('大学英文名称', '').strip(),
            '学位': record.get('学位', ''),
            '专业英文名称': record.get('专业英文名称', ''),
            '所属院系': record.get('所属院系', ''),
            '招生网址': record.get('招生网址', ''),
            '专业网址': record.get('专业网址', ''),
        }
        
        # 获取三个LLM responses
        llm_responses = record.get('llm_reponses', {})
        response_1 = llm_responses.get('response 1', {}).get('response_text', '').strip()
        response_2 = llm_responses.get('response 2', {}).get('response_text', '').strip()
        response_3 = llm_responses.get('response 3', {}).get('response_text', '').strip()
        
        # 数字映射字典
        number_mapping = {
            'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
            '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5
        }
        
        def classify_response(response_text):
            """
            分类单个回答
            返回: 推荐信数量(int) 或 'no_explicit_demand' 或 'invalid'
            """
            if not response_text or len(response_text.strip()) == 0:
                return 'invalid'
            
            response_lower = response_text.lower()
            
            # 检查无效回答
            invalid_patterns = [
                r'^not found$',
                r'^error',
                r'^no data',
                r'^unable to',
                r'^cannot find',
                r'^the\s+.{50,}',  # 以"The"开头的长篇回答
                r'incomplete',
                r'insufficient'
            ]
            
            for pattern in invalid_patterns:
                if re.search(pattern, response_lower, re.IGNORECASE):
                    return 'invalid'
            
            # 检查"no explicit demand"类型的回答
            no_demand_patterns = [
                r'no explicit demands?\s+on\s+letters?\s+of\s+recommendation',
                r'not mentioned',
                r'no\s+specific\s+requirement',
                r'no\s+requirement',
                r'no\s+explicit\s+requirement',
                r'not\s+specified',
                r'not\s+stated',
                r'no\s+information',
                r'does\s+not\s+mention'
            ]
            
            for pattern in no_demand_patterns:
                if re.search(pattern, response_lower, re.IGNORECASE):
                    return 'no_explicit_demand'
            
            # 提取数字
            # 匹配模式：X letters of recommendation, X letter of recommendation等
            number_patterns = [
                r'(\d+)\s+letters?\s+of\s+recommendation',
                r'(\w+)\s+letters?\s+of\s+recommendation',
                r'requires?\s+(\d+)\s+letters?',
                r'requires?\s+(\w+)\s+letters?',
                r'(\d+)\s+recommendation\s+letters?',
                r'(\w+)\s+recommendation\s+letters?'
            ]
            
            extracted_numbers = []
            
            for pattern in number_patterns:
                matches = re.findall(pattern, response_lower, re.IGNORECASE)
                for match in matches:
                    if match.lower() in number_mapping:
                        extracted_numbers.append(number_mapping[match.lower()])
                    elif match.isdigit():
                        num = int(match)
                        if 0 <= num <= 5:  # 合理范围内的推荐信数量
                            extracted_numbers.append(num)
            
            # 如果找到了数字，返回最常见的数字
            if extracted_numbers:
                return max(set(extracted_numbers), key=extracted_numbers.count)
            
            # 如果没有找到明确的数字，检查是否有其他相关信息
            return 'invalid'
        
        # 分类三个回答
        classifications = []
        response_details = []
        
        for i, response in enumerate([response_1, response_2, response_3], 1):
            classification = classify_response(response)
            classifications.append(classification)
            response_details.append(f"Response {i}: {classification} - {response[:100]}{'...' if len(response) > 100 else ''}")
        
        # 过滤掉无效回答
        valid_classifications = [c for c in classifications if c != 'invalid']
        
        # 进行判断
        if len(valid_classifications) == 0:
            # 所有回答都无效
            final_decision = ""
            decision_status = "所有回答无效"
        elif len(valid_classifications) == 1:
            # 只有一个有效回答
            if valid_classifications[0] == 'no_explicit_demand':
                final_decision = "no explicit demand"
            else:
                final_decision = f"{valid_classifications[0]} letters of recommendation"
            decision_status = "只有一个有效回答"
        else:
            # 多个有效回答
            # 分离数字回答和"no explicit demand"回答
            numeric_answers = [c for c in valid_classifications if isinstance(c, int)]
            no_demand_answers = [c for c in valid_classifications if c == 'no_explicit_demand']
            
            if len(no_demand_answers) == len(valid_classifications):
                # 所有有效回答都是"no explicit demand"
                final_decision = "no explicit demand"
                decision_status = f"所有{len(valid_classifications)}个有效回答都是no explicit demand"
            elif len(numeric_answers) == len(valid_classifications):
                # 所有有效回答都是数字
                if len(set(numeric_answers)) == 1:
                    # 所有数字相同
                    final_decision = f"{numeric_answers[0]} letters of recommendation"
                    decision_status = f"所有{len(valid_classifications)}个有效回答一致"
                else:
                    # 数字不同，使用majority vote或取更高的数字
                    vote_counts = Counter(numeric_answers)
                    max_count = max(vote_counts.values())
                    max_numbers = [num for num, count in vote_counts.items() if count == max_count]
                    
                    if len(max_numbers) == 1:
                        # 有明确的majority
                        final_decision = f"{max_numbers[0]} letters of recommendation"
                        decision_status = f"Majority vote: {max_numbers[0]} ({max_count}/{len(numeric_answers)})"
                    else:
                        # 平票，取更高的数字
                        final_number = max(max_numbers)
                        final_decision = f"{final_number} letters of recommendation"
                        decision_status = f"平票情况，取更高数字: {final_number}"
            else:
                # 混合回答（数字+no explicit demand）
                if len(numeric_answers) >= len(no_demand_answers):
                    # 数字回答更多或相等，使用数字结果
                    if len(set(numeric_answers)) == 1:
                        final_decision = f"{numeric_answers[0]} letters of recommendation"
                        decision_status = f"数字回答占主导，结果一致: {numeric_answers[0]}"
                    else:
                        # 取更高的数字
                        final_number = max(numeric_answers)
                        final_decision = f"{final_number} letters of recommendation"
                        decision_status = f"数字回答占主导，取更高数字: {final_number}"
                else:
                    # no explicit demand回答更多
                    final_decision = "no explicit demand"
                    decision_status = f"no explicit demand回答占主导"
        
        # 将所有信息合并
        result = basic_info.copy()
        result.update({
            '推荐信': final_decision,
            '判断状态': decision_status,
            '有效回答数': len(valid_classifications),
            '总回答数': len([r for r in [response_1, response_2, response_3] if r.strip()]),
            'Response 1 分类': classifications[0],
            'Response 2 分类': classifications[1],
            'Response 3 分类': classifications[2],
        })
        
        results.append(result)
    
    # 转换为DataFrame
    df = pd.DataFrame(results)
    
    # 保存CSV文件到同一目录
    output_dir = os.path.dirname(json_file_path)
    csv_filename = os.path.basename(json_file_path).replace('.json', '_processed.csv')
    output_path = os.path.join(output_dir, csv_filename)
    
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"处理完成！结果已保存到: {output_path}")
    print(f"共处理 {len(df)} 条记录")
    print(f"\n推荐信需求分布:")
    print(df['推荐信'].value_counts(dropna=False))
    print(f"\n判断状态分布:")
    print(df['判断状态'].value_counts())
    
    return df


json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/推荐信/final_推荐信_gemini-2.5-flash_0_-1.json"
df = extract_recommendation_letters_info(json_file_path)

# 显示前几行数据预览
print("\n数据预览:")
print(df[['大学英文名称', '学位', '专业英文名称', '推荐信', '判断状态']].head(10))

处理完成！结果已保存到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/推荐信/final_推荐信_gemini-2.5-flash_0_-1_processed.csv
共处理 350 条记录

推荐信需求分布:
推荐信
3 letters of recommendation    190
2 letters of recommendation    100
no explicit demand              48
1 letters of recommendation     11
                                 1
Name: count, dtype: int64

判断状态分布:
判断状态
所有3个有效回答一致                      227
所有2个有效回答一致                       45
所有3个有效回答都是no explicit demand     25
no explicit demand回答占主导          12
数字回答占主导，结果一致: 3                  12
所有2个有效回答都是no explicit demand      8
只有一个有效回答                          7
Majority vote: 2 (2/3)            4
Majority vote: 3 (2/3)            3
Majority vote: 1 (2/3)            2
数字回答占主导，结果一致: 1                   1
数字回答占主导，结果一致: 2                   1
平票情况，取更高数字: 3                     1
所有回答无效                            1
数字回答占主导，取更高数字: 3                  1
Name: count, dtype: int64

数据预览:
                       大学