In [4]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "面试"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(121, 10)

In [5]:
prompt_template = """
You are an assistant whose only task is to decide whether the graduate program below **requires any form of interview or video assessment** and to give a one-sentence justification.

ℹ️  What counts as an “interview”
• Live interviews (in-person or virtual) **and** asynchronous video tasks (e.g., Kira Talent, prerecorded video essays).  
• If the program labels the interview/video as *optional*, *by invitation*, or *for shortlisted applicants*, you must still treat it as **interview required**.  
• Return **No requirement** only when **no** interview or video component is mentioned on official *.edu* pages.

────────────────────────────────────────────────────────
How to verify  

1. **Primary *.edu* sources only**  
   • Check the Admissions and Program URLs below.  
   • You may open other pages on the same *.edu* domain (e.g., “Application Requirements”, “FAQ”, “Video Essay”, “Interview Process”).  
   ⛔  Ignore non-*.edu* sites, blogs, forums, or news articles.

2. **Google search (one query)**  
   "{university} {department} {degree} {program} interview video site:.edu"  
   Review *.edu* results only.

3. **Decision rules**  
   • If any page mentions a live interview or recorded video component—mandatory, optional, or by invitation—output **interview required**, then add a brief reason (e.g., “Official FAQ notes a required video interview”).  
   • If no reference appears on official *.edu* pages, output **No requirement**, then add a brief reason (e.g., “No interview mentioned on program or admissions pages”).  
   • Never infer or invent.

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, lowercase classification first, then a period and one short explanation)

interview required. Official site states <reason>  
No requirement. No interview mentioned on official pages  

────────────────────────────────────────────────────────
Pages to consult first:  
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

Does this program require an interview or video assessment?
"""


In [6]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(3)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    1. Format the prompt for this row
    2. Launch `num_vote` Gemini calls in parallel
    3. Capture BOTH normal answers *and* every possible error case
    4. Return a serialisable record
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系（英文）"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            # -- 3. Extract main answer text
            try:
                text = response.candidates[0].content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{response.candidates[0].grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{response.candidates[0].grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                search_support = (
                    f"Search Supports: "
                    f"{response.candidates[0].grounding_metadata.groundingSupports}"
                )
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Runs `process_row` over the dataframe slice asynchronously,
    shows a live tqdm bar, and dumps the results to JSON.
    """
    df = field_df.copy()[start_from:end_at]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}_newnewp.json"

    with open(output_path, "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [7]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = len(field_df)
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 121/121 [10:09<00:00,  5.03s/it]


In [8]:
json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/面试/面试_gemini-2.5-flash_0_121_newnewp.json"

In [9]:
import json
import pandas as pd
import re
import os

def extract_interview_requirements(json_file_path):
    """
    从JSON文件中提取面试需求信息
    
    Args:
        json_file_path: JSON文件路径
    
    Returns:
        DataFrame: 处理后的数据
    """
    
    # 读取JSON文件
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    results = []
    
    for record in data:
        # 提取基本信息
        basic_info = {
            '大学英文名称': record.get('大学英文名称', '').strip(),
            '学位': record.get('学位', ''),
            '专业英文名称': record.get('专业英文名称', ''),
            '所属院系': record.get('所属院系', ''),
            '所属院系（英文）': record.get('所属院系（英文）', ''),
        }
        
        # 检查是否为PhD项目
        phd = str(basic_info['学位'])
        if "p" in phd.lower() and "h" in phd.lower() and "d" in phd.lower():
            is_phd = True
        else:
            is_phd = False
        
        
        if is_phd:
            # PhD项目直接返回空
            result = basic_info.copy()
            result.update({
                '面试': '',
                '判定状态': 'PhD项目-默认跳过'
            })
            results.append(result)
            continue
        
        # 处理Master项目的LLM回答
        responses = record.get('llm_reponses', {})
        valid_responses = []
        
        for response_key in ['response 1', 'response 2', 'response 3']:
            if response_key in responses:
                response_text = responses[response_key].get('response_text', '').strip()
                
                # 检查是否为无效回答
                invalid_patterns = [
                    r'^not found$',
                    r'^the$',
                    r'^error',
                    r'^no data',
                    r'^unable to',
                    r'^could not',
                    r'^\w{1,3}$'  # 太短的回答（1-3个字符）
                ]
                
                is_invalid = any(re.search(pattern, response_text.lower()) for pattern in invalid_patterns)
                
                if not is_invalid and response_text:
                    # 判断回答内容
                    response_lower = response_text.lower()
                    
                    if 'no requirement' in response_lower:
                        valid_responses.append('不需要')
                    elif any(keyword in response_lower for keyword in ['interview required', 'interview is required', 'requires interview']):
                        valid_responses.append('需要')
                    elif len(response_text) < 20 and 'no' in response_lower:
                        # 短回答包含"no"的情况
                        valid_responses.append('不需要')
                    else:
                        # 其他长回答，检查是否暗示不需要面试
                        no_interview_indicators = [
                            'do not mention',
                            'does not mention', 
                            'not explicitly mention',
                            'no explicit mention',
                            'not listed',
                            'not required',
                            'not specified'
                        ]
                        
                        if any(indicator in response_lower for indicator in no_interview_indicators):
                            valid_responses.append('不需要')
                        else:
                            # 如果无法明确判断，暂时归类为无效
                            pass
        
        # 投票决定结果
        result = basic_info.copy()
        
        if not valid_responses:
            result.update({
                '面试': '',
                '判定状态': '无有效回答'
            })
        else:
            need_count = valid_responses.count('需要')
            no_need_count = valid_responses.count('不需要')
            
            if need_count > no_need_count:
                result.update({
                    '面试': '需要',
                    '判定状态': f'投票结果-需要({need_count}票) vs 不需要({no_need_count}票)'
                })
            elif no_need_count > need_count:
                result.update({
                    '面试': '不需要',
                    '判定状态': f'投票结果-不需要({no_need_count}票) vs 需要({need_count}票)'
                })
            else:
                result.update({
                    '面试': '',
                    '判定状态': f'投票平局-需要({need_count}票) vs 不需要({no_need_count}票)'
                })
        
        results.append(result)
    
    return pd.DataFrame(results)

def process_interview_json(json_file_path):
    """
    处理面试JSON文件并保存结果
    """
    
    # 提取数据
    df = extract_interview_requirements(json_file_path)
    
    # 生成输出文件名
    output_path = os.path.join(os.path.dirname(json_file_path), "面试需求_处理结果_newp.csv")
    
    # 保存为CSV
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    
    # 统计信息
    total_records = len(df)
    phd_records = len(df[df['判定状态'].str.contains('PhD项目')])
    need_interview = len(df[df['面试'] == '需要'])
    no_need_interview = len(df[df['面试'] == '不需要'])
    unclear_records = len(df[df['面试'] == ''])
    
    print(f"处理完成！结果已保存到: {output_path}")
    print(f"总共处理了 {total_records} 条记录")
    print(f"PhD项目（跳过）: {phd_records} 条")
    print(f"需要面试: {need_interview} 条")
    print(f"不需要面试: {no_need_interview} 条")
    print(f"无法确定: {unclear_records} 条")
    
    print(f"\n前5条结果预览:")
    print(df[['大学英文名称', '专业英文名称', '学位', '面试', '判定状态']].head())
    
    return df

# 使用示例
df = process_interview_json(json_file_path)

处理完成！结果已保存到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/面试/面试需求_处理结果_newp.csv
总共处理了 121 条记录
PhD项目（跳过）: 30 条
需要面试: 10 条
不需要面试: 81 条
无法确定: 30 条

前5条结果预览:
                                  大学英文名称  \
0  Massachusetts Institute of Technology   
1  Massachusetts Institute of Technology   
2  Massachusetts Institute of Technology   
3                     Harvard University   
4                     Harvard University   

                                              专业英文名称        学位   面试  \
0  Civil and Environmental Engineering：Data Scien...      Meng  不需要   
1                     Social and Engineering Systems       PhD        
2            Data, Economics, and Development Policy       MAS  不需要   
3                            Master in Public Policy       MPP  不需要   
4  Master in Public Administration in Internation...  MPP / ID  不需要   

                     判定状态  
0  投票结果-不需要(3票) vs 需要(0票)  
1              PhD项目-默认跳过  
2  投票结果-不需