In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "所属院系（英文）"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 7)

In [2]:
prompt_template = """
You are an assistant whose only task is to identify the **school / college / faculty** that houses the graduate program below and return its official homepage URL.

────────────────────────────────────────────────────────
How to find the information  

1. **Primary *.edu* sources only**  
   • Inspect the Admissions and Program URLs provided.  
   • Open additional pages within the same university’s *.edu* domain (e.g., “About the School”, “Departments”).  
   ⛔  Ignore non-*.edu* sites, blogs, rankings, or third-party profiles.

2. **Optional Google search**  
   Single query:  
   "{university} {degree} {program} school college site:.edu"  
   Check only *.edu* results until you locate the school’s homepage.

3. **Selection rules**  
   • Capture *one* clear school/college name (e.g., “School of Engineering and Applied Science”).  
   • Copy the most authoritative homepage URL for that school (https://…).  
   • If no reliable *.edu* source identifies the school, output **Not found**.  
   • Never invent.

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, no quotes, no extra text)  

<School / College Name>, <URL>  
or  
Not found  

Examples of valid outputs:  
School of Engineering and Applied Science, https://engineering.columbia.edu  
Graduate School of Arts and Sciences, https://gsas.harvard.edu  
Not found  

────────────────────────────────────────────────────────
Pages to consult first:  
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

What is the school/college and its homepage?
"""


In [3]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(4)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    1. Format the prompt for this row
    2. Launch `num_vote` Gemini calls in parallel
    3. Capture BOTH normal answers *and* every possible error case
    4. Return a serialisable record
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            # -- 3. Extract main answer text
            try:
                text = response.candidates[0].content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{response.candidates[0].grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{response.candidates[0].grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                search_support = (
                    f"Search Supports: "
                    f"{response.candidates[0].grounding_metadata.groundingSupports}"
                )
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Runs `process_row` over the dataframe slice asynchronously,
    shows a live tqdm bar, and dumps the results to JSON.
    """
    df = field_df.copy()[start_from:end_at]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    with open(output_path, "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [4]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = -1
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 350/350 [26:08<00:00,  4.48s/it]


In [9]:
import json
import pandas as pd
import random
import re
from collections import Counter

def extract_department_from_json(json_file_path, output_csv_path):
    """
    从JSON文件中提取所属院系（英文）信息
    
    Args:
        json_file_path: JSON文件路径
        output_csv_path: 输出CSV文件路径
    
    Returns:
        DataFrame: 处理后的数据
    """
    
    # 读取JSON文件
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    results = []
    
    for record in data:
        # 提取基本信息
        basic_info = {
            '大学英文名称': record.get('大学英文名称', ''),
            '学位': record.get('学位', ''),
            '专业英文名称': record.get('专业英文名称', ''),
            '所属院系': record.get('所属院系', ''),
            '招生网址': record.get('招生网址', ''),
            '专业网址': record.get('专业网址', ''),
        }
        
        # 提取LLM响应
        llm_responses = record.get('llm_reponses', {})
        
        # 提取三次回答的response_text
        response_texts = []
        for i in range(1, 4):
            response_key = f"response {i}"
            if response_key in llm_responses:
                response_text = llm_responses[response_key].get('response_text', '')
                response_texts.append(response_text)
            else:
                response_texts.append('')
        
        # 比较三个回答的一致性并选择最终答案
        selected_response, consistency_info, valid_responses = select_best_response(response_texts)
        
        # 解析选中的回答，提取院系名称和URL
        department_name, department_url = parse_response_text(selected_response)
        
        # 构建最终记录
        final_record = basic_info.copy()
        final_record.update({
            '所属院系（英文）': department_name,
            '所属院系网址': department_url,
            'response_1': response_texts[0] if len(response_texts) > 0 else '',
            'response_2': response_texts[1] if len(response_texts) > 1 else '',
            'response_3': response_texts[2] if len(response_texts) > 2 else '',
            'response_1_valid': is_valid_response(response_texts[0]) if len(response_texts) > 0 else False,
            'response_2_valid': is_valid_response(response_texts[1]) if len(response_texts) > 1 else False,
            'response_3_valid': is_valid_response(response_texts[2]) if len(response_texts) > 2 else False,
            'valid_response_count': len(valid_responses),
            'consistency_type': consistency_info['type'],
            'consistency_detail': consistency_info['detail']
        })
        
        results.append(final_record)
    
    # 转换为DataFrame
    df = pd.DataFrame(results)
    
    # 重新排列列的顺序，确保重要信息在前面
    column_order = [
        '大学英文名称', '学位', '专业英文名称', '所属院系', 
        '所属院系（英文）', '所属院系网址',
        '招生网址', '专业网址',
        'valid_response_count', 'consistency_type', 'consistency_detail',
        'response_1', 'response_1_valid',
        'response_2', 'response_2_valid', 
        'response_3', 'response_3_valid'
    ]
    
    # 确保所有列都存在
    for col in column_order:
        if col not in df.columns:
            df[col] = ''
    
    # 按指定顺序重排列
    df = df[column_order]
    
    # 保存为CSV
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    
    # 打印统计信息
    print(f"数据处理完成！")
    print(f"总共处理了 {len(df)} 条记录")
    print(f"有效回答统计:")
    valid_counts = df['valid_response_count'].value_counts().sort_index()
    for count, records in valid_counts.items():
        print(f"  {count}个有效回答: {records} 条记录")
    print(f"一致性统计:")
    consistency_counts = df['consistency_type'].value_counts()
    for consistency_type, count in consistency_counts.items():
        print(f"  {consistency_type}: {count} 条")
    
    # 统计成功提取的院系信息
    successful_extractions = len(df[df['所属院系（英文）'] != ''])
    print(f"成功提取院系信息: {successful_extractions}/{len(df)} ({successful_extractions/len(df):.1%})")
    
    print(f"数据已保存到: {output_csv_path}")
    
    return df

def is_valid_response(response_text):
    """
    检查回答是否符合"school, url"格式
    
    Args:
        response_text: LLM的回答文本
    
    Returns:
        bool: 是否为合法格式
    """
    
    if not response_text or not response_text.strip():
        return False
    
    # 去除首尾空白
    text = response_text.strip()
    
    # 检查是否包含逗号
    if ',' not in text:
        return False
    
    # 检查长度（过长的回答通常是解释性文本）
    if len(text) > 200:  # 设置最大长度限制
        return False
    
    # 检查是否包含URL
    url_pattern = r'https?://[^\s,]+'
    if not re.search(url_pattern, text):
        return False
    
    # 检查是否包含某些无效关键词
    invalid_keywords = [
        'not found', 'not available', 'unable to find', 'could not find',
        'no information', 'not mentioned', 'unclear', 'not specified',
        'the browse results', 'clearly indicate', 'according to'
    ]
    
    text_lower = text.lower()
    for keyword in invalid_keywords:
        if keyword in text_lower:
            return False
    
    # 分割并检查格式
    parts = text.split(',')
    if len(parts) < 2:
        return False
    
    # 第一部分应该是学院名称（不应该太短或包含特殊词汇）
    school_part = parts[0].strip()
    if len(school_part) < 5:  # 学院名称不应该太短
        return False
    
    # 第二部分应该包含URL
    url_part = ','.join(parts[1:]).strip()
    if not re.search(url_pattern, url_part):
        return False
    
    return True

def select_best_response(response_texts):
    """
    根据一致性选择最佳回答（只考虑合法格式的回答）
    
    Args:
        response_texts: 三个LLM回答的列表
    
    Returns:
        tuple: (选中的回答, 一致性信息, 有效回答列表)
    """
    
    # 过滤出符合格式的有效回答
    valid_responses = [resp for resp in response_texts if is_valid_response(resp)]
    
    if not valid_responses:
        return '', {'type': 'no_valid_response', 'detail': 'No valid responses found'}, []
    
    if len(valid_responses) == 1:
        return valid_responses[0], {'type': 'single_valid_response', 'detail': 'Only one valid response'}, valid_responses
    
    # 统计每个有效回答出现的次数
    response_counts = Counter(valid_responses)
    most_common = response_counts.most_common()
    
    # 完全一致（所有有效回答都相同）
    if len(most_common) == 1:
        return most_common[0][0], {'type': 'all_valid_same', 'detail': f'All {most_common[0][1]} valid responses are identical'}, valid_responses
    
    # 两个或更多相同，选择出现次数最多的
    if most_common[0][1] >= 2:
        return most_common[0][0], {'type': 'majority_valid_same', 'detail': f'{most_common[0][1]} valid responses are identical'}, valid_responses
    
    # 所有有效回答都不同，随机选择一个
    selected = random.choice(valid_responses)
    return selected, {'type': 'all_valid_different', 'detail': 'All valid responses are different, randomly selected'}, valid_responses

def parse_response_text(response_text):
    """
    解析回答文本，提取院系名称和URL
    
    Args:
        response_text: LLM的回答文本
    
    Returns:
        tuple: (院系名称, URL)
    """
    
    if not response_text.strip():
        return '', ''
    
    # 按逗号分割
    parts = response_text.split(',')
    
    if len(parts) >= 2:
        department_name = parts[0].strip()
        
        # 在第二部分及之后寻找URL
        url_part = ','.join(parts[1:]).strip()
        
        # 提取URL
        url_pattern = r'https?://[^\s,]+'
        url_match = re.search(url_pattern, url_part)
        
        if url_match:
            department_url = url_match.group()
            # 清理URL末尾的标点符号
            department_url = re.sub(r'[,\.\s]+$', '', department_url)
        else:
            department_url = ''
    
    else:
        # 如果没有逗号，整个文本作为院系名称
        department_name = response_text.strip()
        department_url = ''
    
    return department_name, department_url


In [11]:
json_file = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/所属院系（英文）/所属院系（英文）_gemini-2.5-flash_0_-1.json"
output_csv = "所属院系英文_处理结果.csv"

df = extract_department_from_json(json_file, output_csv)

# 显示前几条记录
print("\n前5条记录预览：")
display_columns = ['大学英文名称', '专业英文名称', '所属院系（英文）', '所属院系网址', 'valid_response_count', 'consistency_type']
print(df[display_columns].head())

# 显示成功提取的示例
print(f"\n=== 成功提取院系信息的示例 ===")
successful = df[df['所属院系（英文）'] != ''].head(5)
for _, row in successful.iterrows():
    print(f"大学: {row['大学英文名称']}")
    print(f"专业: {row['专业英文名称']}")
    print(f"原始院系: {row['所属院系']}")
    print(f"英文院系: {row['所属院系（英文）']}")
    print(f"院系网址: {row['所属院系网址']}")
    print(f"有效回答数: {row['valid_response_count']}")
    print(f"一致性: {row['consistency_type']}")
    print("-" * 50)

# 显示失败的示例
print(f"\n=== 未能提取院系信息的示例 ===")
failed = df[df['所属院系（英文）'] == ''].head(3)
for _, row in failed.iterrows():
    print(f"大学: {row['大学英文名称']}")
    print(f"专业: {row['专业英文名称']}")
    print(f"原始院系: {row['所属院系']}")
    print(f"回答1: {row['response_1']} (有效: {row['response_1_valid']})")
    print(f"回答2: {row['response_2']} (有效: {row['response_2_valid']})")
    print(f"回答3: {row['response_3']} (有效: {row['response_3_valid']})")
    print(f"原因: {row['consistency_detail']}")
    print("-" * 50)

# 验证数据完整性
print(f"\n=== 数据完整性检查 ===")
has_department_name = len(df[df['所属院系（英文）'] != ''])
has_department_url = len(df[df['所属院系网址'] != ''])
has_both = len(df[(df['所属院系（英文）'] != '') & (df['所属院系网址'] != '')])

print(f"有院系名称的记录: {has_department_name}/{len(df)}")
print(f"有院系网址的记录: {has_department_url}/{len(df)}")
print(f"同时有名称和网址的记录: {has_both}/{len(df)}")

数据处理完成！
总共处理了 350 条记录
有效回答统计:
  0个有效回答: 3 条记录
  1个有效回答: 18 条记录
  2个有效回答: 74 条记录
  3个有效回答: 255 条记录
一致性统计:
  all_valid_same: 144 条
  majority_valid_same: 110 条
  all_valid_different: 75 条
  single_valid_response: 18 条
  no_valid_response: 3 条
成功提取院系信息: 347/350 (99.1%)
数据已保存到: 所属院系英文_处理结果.csv

前5条记录预览：
                大学英文名称                       专业英文名称  \
0   Harvard University            Data Science SEAS   
1   Harvard University                 Data Science   
2  Stanford University                 Data Science   
3  Stanford University                 Data Science   
4      Yale University  Statistics and Data Science   

                                            所属院系（英文）  \
0  Harvard John A. Paulson School of Engineering ...   
1                           Harvard Extension School   
2                              School of Engineering   
3                  School of Humanities and Sciences   
4               Graduate School of Arts and Sciences   

                             所属