In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "申请费减免"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(454, 9)

In [2]:
prompt_template = """
You are an assistant whose only task is to determine whether the graduate program below **offers an application-fee waiver**.

────────────────────────────────────────────────────────
How to verify

1. **Use primary *.edu* sources only**  
   • Review the Admissions and Program URLs supplied.  
   • You may open other pages within the same university’s *.edu* domain (e.g., “Application Requirements”, “Fee Waiver”, “FAQ”).  
   ⛔  Disregard all non-*.edu* sites, blogs, forums, press releases, or rankings.

2. **Optional Google search**  
   Query once:  
   "{university} {department} {degree} {program} application fee waiver site:.edu"  
   Check only *.edu* results until you find an authoritative statement.

3. **Decision rules**  
   • If any page clearly states that applicants **may obtain a fee waiver** under any condition, output **applicable**.  
   • If a page explicitly says **no waivers are available**, output **not applicable**.  
   • If neither situation is found, output **not mentioned**.  
   • Never infer or invent; no citations or explanations are allowed.

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, all lowercase, no quotes, no extra text)

applicable  
not applicable  
not mentioned  

────────────────────────────────────────────────────────
Pages to consult first:
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

Does this program offer an application-fee waiver?
"""

In [3]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(2)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    1. Format the prompt for this row
    2. Launch `num_vote` Gemini calls in parallel
    3. Capture BOTH normal answers *and* every possible error case
    4. Return a serialisable record
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系（英文）"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            # -- 3. Extract main answer text
            try:
                text = response.candidates[0].content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{response.candidates[0].grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{response.candidates[0].grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                search_support = (
                    f"Search Supports: "
                    f"{response.candidates[0].grounding_metadata.groundingSupports}"
                )
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Runs `process_row` over the dataframe slice asynchronously,
    shows a live tqdm bar, and dumps the results to JSON.
    """
    df = field_df.copy()[start_from:end_at]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    with open(output_path, "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [4]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = len(field_df)
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 454/454 [1:00:09<00:00,  7.95s/it]


In [5]:
json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math/fields_records/申请费减免/申请费减免_gemini-2.5-flash_0_454.json"

In [6]:
import json
import pandas as pd
import re
import os
from collections import Counter

def extract_fee_waiver_info(json_file_path):
    """
    从JSON文件中提取申请费减免信息，使用majority vote判断
    
    Args:
        json_file_path: JSON文件路径
    
    Returns:
        DataFrame: 处理后的数据
    """
    
    # 读取JSON文件
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    results = []
    
    for record in data:
        # 提取基本信息
        basic_info = {
            '大学英文名称': record.get('大学英文名称', '').strip(),
            '学位': record.get('学位', ''),
            '专业英文名称': record.get('专业英文名称', ''),
            '所属院系': record.get('所属院系', ''),
            '招生网址': record.get('招生网址', ''),
            '专业网址': record.get('专业网址', ''),
        }
        
        # 获取三个LLM responses
        llm_responses = record.get('llm_reponses', {})
        response_1 = llm_responses.get('response 1', {}).get('response_text', '').strip()
        response_2 = llm_responses.get('response 2', {}).get('response_text', '').strip()
        response_3 = llm_responses.get('response 3', {}).get('response_text', '').strip()
        
        # 分类每个回答
        def classify_response(response_text):
            """
            分类单个回答
            返回: 'applicable', 'not_applicable', 'invalid'
            """
            if not response_text or len(response_text.strip()) == 0:
                return 'invalid'
            
            # 检查无效回答
            invalid_patterns = [
                r'^not found$',
                r'^the\b',  # 以"The"开头的长篇回答通常无效
                r'^error',
                r'^no data',
                r'^unable to',
                r'^cannot find'
            ]
            
            response_lower = response_text.lower()
            for pattern in invalid_patterns:
                if re.search(pattern, response_lower, re.IGNORECASE):
                    return 'invalid'
            
            # 检查"not applicable"（优先级更高）
            if re.search(r'not applicable', response_lower, re.IGNORECASE):
                return 'not_applicable'
            
            # 检查"applicable"
            if re.search(r'applicable', response_lower, re.IGNORECASE):
                return 'applicable'
            
            # 如果都没有匹配，视为无效
            return 'invalid'
        
        # 分类三个回答
        classifications = []
        response_details = []
        
        for i, response in enumerate([response_1, response_2, response_3], 1):
            classification = classify_response(response)
            classifications.append(classification)
            response_details.append(f"Response {i}: {classification} - {response[:100]}{'...' if len(response) > 100 else ''}")
        
        # 过滤掉无效回答
        valid_classifications = [c for c in classifications if c != 'invalid']
        
        # 进行majority vote
        if len(valid_classifications) == 0:
            # 所有回答都无效
            final_decision = ""
            decision_status = "所有回答无效"
        elif len(valid_classifications) == 1:
            # 只有一个有效回答
            final_decision = valid_classifications[0].replace('_', ' ')
            decision_status = "只有一个有效回答"
        else:
            # 计算各类回答的数量
            vote_counts = Counter(valid_classifications)
            
            if len(vote_counts) == 1:
                # 所有有效回答一致
                final_decision = list(vote_counts.keys())[0].replace('_', ' ')
                decision_status = f"所有{len(valid_classifications)}个有效回答一致"
            else:
                # 回答不一致，检查是否有majority
                max_count = max(vote_counts.values())
                max_categories = [cat for cat, count in vote_counts.items() if count == max_count]
                
                if len(max_categories) == 1:
                    # 有明确的majority
                    final_decision = max_categories[0].replace('_', ' ')
                    decision_status = f"Majority vote: {max_count}/{len(valid_classifications)}"
                else:
                    # 平票情况
                    final_decision = ""
                    decision_status = f"平票: {dict(vote_counts)}"
        
        # 将所有信息合并
        result = basic_info.copy()
        result.update({
            '申请费减免': final_decision,
            '判断状态': decision_status,
            '有效回答数': len(valid_classifications),
            '总回答数': len([r for r in [response_1, response_2, response_3] if r.strip()]),
            'Response 1 分类': classify_response(response_1),
            'Response 2 分类': classify_response(response_2),
            'Response 3 分类': classify_response(response_3),
        })
        
        results.append(result)
    
    # 转换为DataFrame
    df = pd.DataFrame(results)
    
    # 保存CSV文件到同一目录
    output_dir = os.path.dirname(json_file_path)
    csv_filename = os.path.basename(json_file_path).replace('.json', '_processed.csv')
    output_path = os.path.join(output_dir, csv_filename)
    
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"处理完成！结果已保存到: {output_path}")
    print(f"共处理 {len(df)} 条记录")
    print(f"\n申请费减免分布:")
    print(df['申请费减免'].value_counts(dropna=False))
    print(f"\n判断状态分布:")
    print(df['判断状态'].value_counts())
    
    return df

df = extract_fee_waiver_info(json_file_path)

# 显示前几行数据预览
print("\n数据预览:")
print(df[['大学英文名称', '学位', '专业英文名称', '申请费减免', '判断状态']].head(10))

处理完成！结果已保存到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/Major/Math/fields_records/申请费减免/申请费减免_gemini-2.5-flash_0_454_processed.csv
共处理 454 条记录

申请费减免分布:
申请费减免
applicable        386
                   46
not applicable     22
Name: count, dtype: int64

判断状态分布:
判断状态
所有3个有效回答一致                                    275
所有2个有效回答一致                                     82
只有一个有效回答                                       41
所有回答无效                                         40
Majority vote: 2/3                             10
平票: {'applicable': 1, 'not_applicable': 1}      3
平票: {'not_applicable': 1, 'applicable': 1}      3
Name: count, dtype: int64

数据预览:
                                  大学英文名称      学位  \
0                   Princeton University  Ph.D.    
1                   Princeton University  Ph.D.    
2  Massachusetts Institute of Technology     PhD   
3  Massachusetts Institute of Technology     ScD   
4                     Harvard University     PhD   
5           