In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

# Name of the field this notebook extracts. It selects the input CSV below and is
# also read as a global by request_and_store_async to build its output directory
# and file name.
field_name = "项目标签"
# NOTE(review): absolute, machine-specific path; the notebook only runs where this
# directory exists.
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

# Displayed as the cell output: (rows, columns) of the loaded table.
field_df.shape

(121, 9)

In [2]:
# Prompt handed to process_row, which fills it with str.format().
# Only {admissions_url} and {program_url} are placeholders here; the other keyword
# arguments process_row supplies (university, degree, program, department) are
# ignored by this template.
# The <Tag>...</Tag> output convention requested below is what
# extract_project_tags later parses with a regular expression.
prompt_template = """
Your task is to summarize the program characteristics of a graduate program from a university based on the content of the provided URL.
First, please carefully access and read the content of the following URL:
<URL>
{admissions_url}
{program_url}
</URL>
When summarizing the program characteristics, focus on the following aspects:
1. Disciplinary characteristics: Identify the main disciplines involved in the program, such as math, statistics, data science.
2. Interdisciplinary nature: Determine whether the program is an interdisciplinary program.
When extracting information from the URL content, be thorough and precise. One program can have multiple characteristics. For each program, you should return no more than 5 characteristics. 
If the program urls provides more than 5 keywords, you should return the most important 5 keywords.

Please output the identified characteristics as tags. For example, if the program involves math and statistics, is interdisciplinary, the output could be:

<Tag>math</Tag>
<Tag>statistics</Tag>
<Tag>interdisciplinary</Tag>

Start summarizing the program characteristics now. Remember to output the characteristics in the <Tag> tags and return nothing else.
"""

In [3]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# ---------------------------------------------------------------------------
# process_row holds this semaphore for its entire body, so at most 5 rows are
# processed at the same time. Each row launches `num_vote` calls in parallel,
# which means up to 5 * num_vote calls to async_call_gemini can be in flight.
semaphore = asyncio.Semaphore(5)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    Query Gemini `num_vote` times for one dataframe row and record every outcome.

    Steps:
    1. Format the prompt for this row.
    2. Launch `num_vote` Gemini calls in parallel.
    3. Capture normal answers as well as every failure mode: exceptions raised
       by a call, "Error:" strings, empty response objects and unparsable ones.
    4. Return one record intended to be dumped as JSON.

    Args:
        row: A pandas Series (anything exposing ``to_dict()``) containing the
            columns referenced in the ``format`` call below.
        prompt_template: ``str.format`` template. Keyword arguments it has no
            placeholder for are simply ignored.
        num_vote: Number of independent calls issued for this row.
        model_name: Forwarded unchanged to ``async_call_gemini``.

    Returns:
        dict: The row's own fields plus the key ``"llm_reponses"``, mapping
        ``"response <k>"`` (k starting at 1) to either ``{"error": ...}`` or a
        dict holding the answer text, grounding metadata and the raw response.
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系（英文）"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        # The misspelt key is kept on purpose: extract_project_tags reads
        # exactly "llm_reponses" from the stored JSON.
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        # return_exceptions=True: a single failing call must not discard the
        # other votes for this row, nor abort the whole batch.
        responses = await asyncio.gather(*tasks, return_exceptions=True)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 0. Exception raised inside the call itself
            if isinstance(response, BaseException):
                if not isinstance(response, Exception):
                    # Cancellation / interpreter shutdown must keep propagating.
                    raise response
                record["llm_reponses"][resp_key] = {
                    "error": f"Exception: {type(response).__name__}: {response}"
                }
                continue

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            candidate = response.candidates[0]

            # -- 3. Extract main answer text
            try:
                text = candidate.content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort: any missing
            #       attribute simply degrades that field to "Not used")
            try:
                url_context = str(candidate.url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{candidate.grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{candidate.grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                grounding = candidate.grounding_metadata
                try:
                    # snake_case, matching the sibling attributes read above
                    supports = grounding.grounding_supports
                except AttributeError:
                    # NOTE(review): fall back to the camelCase spelling the
                    # original code used, in case the response object exposes it.
                    supports = grounding.groundingSupports
                search_support = f"Search Supports: {supports}"
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Run `process_row` over a positional slice of `field_df`, show a tqdm
    progress bar, and dump the collected records to a JSON file.

    Args:
        prompt_template: Template forwarded to `process_row`.
        field_df: DataFrame with one program per row.
        num_vote: Number of Gemini calls per row, forwarded to `process_row`.
        model_name: Model identifier; also part of the output file name.
        start_from: Position of the first row to process.
        end_at: Position one past the last row to process. The default ``-1``
            means "through the last row"; a plain ``[start:-1]`` slice would
            silently skip the final program. Other values keep normal slice
            semantics.

    Returns:
        list: One record per processed row, in row order.

    Side effects:
        Writes ``../fields_records/<field_name>/<field_name>_<model_name>_<start_from>_<end_at>.json``,
        resolved against the current working directory, where ``field_name``
        is the notebook-level global.
    """
    stop = len(field_df) if end_at == -1 else end_at
    df = field_df.iloc[start_from:stop].copy()

    # Spawn one coroutine per row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    # The file name keeps the caller-supplied bounds so existing paths stay valid.
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned instead of relying on the platform default; the reader opens
    # this file as UTF-8.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [5]:
import nest_asyncio

# Only needed in Jupyter: allows asyncio.run() to be called from the notebook.
nest_asyncio.apply()

# Run configuration: which model, how many votes per row, which rows.
model_name = "gemini-2.5-flash"
num_vote = 3
start_from, end_at = 0, len(field_df)

batch_coroutine = request_and_store_async(
    prompt_template,
    field_df,
    num_vote,
    model_name,
    start_from=start_from,
    end_at=end_at,
)
response_records = asyncio.run(batch_coroutine)

100%|██████████| 121/121 [07:10<00:00,  3.56s/it]


In [6]:
# Input: stored LLM responses. The file name follows the
# <field>_<model>_<start>_<end>.json pattern produced by request_and_store_async.
# NOTE(review): absolute path, whereas the writer uses a path relative to the
# working directory - confirm both point at the same folder.
json_file_path = '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/项目标签/项目标签_gemini-2.5-flash_0_121.json'
# Output: CSV with one selected tag list per program.
output_csv_path = '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/项目标签/项目标签_gemini-2.5-flash.csv'

In [7]:
import json
import re
import pandas as pd

def extract_project_tags(json_file_path, output_csv_path):
    """
    Select one usable tag list per program from stored LLM responses and save it as CSV.

    For each program the entries "response 1", "response 2" and "response 3"
    are tried in that order, and the first one that passes validation supplies
    the tags. When none qualifies, the tag cell holds the placeholder
    "需要额外确认" and ``response_used`` is "none".

    A response is valid when its text is a non-blank string, does not contain
    "not found" (case-insensitive), is at most 1000 characters long and yields
    at least one non-blank ``<Tag>...</Tag>`` entry.

    Args:
        json_file_path (str): Path to the JSON file containing project tag data
            (a list of program records).
        output_csv_path (str): Path to save the output CSV file.

    Returns:
        pd.DataFrame: One row per program with the columns 大学英文名称, 学位,
        专业英文名称, 所属院系, 项目标签 and response_used. The columns are
        present even when the input list is empty.
    """
    tag_pattern = re.compile(r'<[Tt]ag>([^<]+)</[Tt]ag>')
    needs_review = "需要额外确认"
    output_columns = ['大学英文名称', '学位', '专业英文名称', '所属院系',
                      '项目标签', 'response_used']
    # Maps the 1-based response number to the counter it increments.
    counter_for_response = {
        1: 'first_response_valid',
        2: 'second_response_used',
        3: 'third_response_used',
    }

    def clean_tags(response_text):
        """Return the stripped, title-cased, non-blank tags found in the text."""
        return [tag.strip().title()
                for tag in tag_pattern.findall(response_text)
                if tag.strip()]

    def is_valid_response(response_text):
        """Check if a response is valid (contains proper <Tag> format)"""
        if not isinstance(response_text, str) or response_text.strip() == "":
            return False
        if "not found" in response_text.lower():
            return False
        if len(response_text) > 1000:  # Too long response
            return False
        # Require at least one tag that survives cleaning; a response made of
        # blank tags would otherwise produce an empty tag cell.
        return len(clean_tags(response_text)) > 0

    def extract_tags_from_response(response_text):
        """Extract tags from a valid response"""
        return ", ".join(clean_tags(response_text))

    # Load JSON data
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = []
    stats = {
        'total_programs': 0,
        'first_response_valid': 0,
        'second_response_used': 0,
        'third_response_used': 0,
        'need_confirmation': 0
    }

    def percent(count):
        """Share of all programs in percent; 0.0 when there are no programs."""
        total = stats['total_programs']
        return count / total * 100 if total else 0.0

    for program in data:
        stats['total_programs'] += 1

        # NOTE(review): process_row formats its prompt from the column
        # "所属院系（英文）", while this reads "所属院系" - confirm the input
        # CSV really carries both columns.
        llm_responses = program.get('llm_reponses') or {}

        # Try responses in order: response 1, response 2, response 3
        final_tags = needs_review
        response_used = "none"

        for i in range(1, 4):
            entry = llm_responses.get(f"response {i}")
            if not isinstance(entry, dict):
                continue  # missing or malformed entry
            response_text = entry.get('response_text', '')

            if is_valid_response(response_text):
                final_tags = extract_tags_from_response(response_text)
                response_used = f"response_{i}"
                stats[counter_for_response[i]] += 1
                break

        if response_used == "none":
            stats['need_confirmation'] += 1

        # Store result
        results.append({
            '大学英文名称': program.get('大学英文名称', ''),
            '学位': program.get('学位', ''),
            '专业英文名称': program.get('专业英文名称', ''),
            '所属院系': program.get('所属院系', ''),
            '项目标签': final_tags,
            'response_used': response_used
        })

    # Explicit columns keep the CSV header even when there are no rows.
    df = pd.DataFrame(results, columns=output_columns)

    # Save to CSV
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

    # Print statistics
    print("项目标签提取统计:")
    print(f"总项目数: {stats['total_programs']}")
    print(f"第一次回答有效: {stats['first_response_valid']} ({percent(stats['first_response_valid']):.1f}%)")
    print(f"使用第二次回答: {stats['second_response_used']} ({percent(stats['second_response_used']):.1f}%)")
    print(f"使用第三次回答: {stats['third_response_used']} ({percent(stats['third_response_used']):.1f}%)")
    print(f"需要额外确认: {stats['need_confirmation']} ({percent(stats['need_confirmation']):.1f}%)")

    return df


# Build the per-program tag table and write it to output_csv_path.
df = extract_project_tags(json_file_path, output_csv_path)

项目标签提取统计:
总项目数: 121
第一次回答有效: 107 (88.4%)
使用第二次回答: 8 (6.6%)
使用第三次回答: 1 (0.8%)
需要额外确认: 5 (4.1%)


In [2]:
# Tag CSV that the cells below read.
csv_file = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/项目标签/项目标签_gemini-2.5-flash.csv"
# Translation dictionary; the functions below load it with json.load(f)[0], i.e.
# they expect a JSON array whose first element is the English -> Chinese mapping.
json_file = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_notebook/tags.json"
# NOTE(review): identical to csv_file, so process_project_tags_bilingual
# overwrites its own input in place.
output_file = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/项目标签/项目标签_gemini-2.5-flash.csv"

In [3]:
import pandas as pd
import json

def check_missing_tags(csv_file_path, tags_json_path):
    """
    Print every tag used in the CSV that has no entry in the translation dictionary.

    The tag column is looked up as '项目标签（英文）' first and as '项目标签'
    second, so the check works both on the raw tag CSV and on a CSV whose tag
    column has already been renamed.

    Args:
        csv_file_path: Path of the CSV file holding comma-separated tags.
        tags_json_path: Path of the translation dictionary JSON file (an array
            whose first element is the English -> Chinese mapping).

    Raises:
        KeyError: If the CSV contains neither tag column.
    """
    # Load the translation dictionary
    with open(tags_json_path, 'r', encoding='utf-8') as f:
        translation_dict = json.load(f)[0]

    # Load the CSV file
    df = pd.read_csv(csv_file_path, encoding='utf-8-sig')

    tag_column = next(
        (name for name in ('项目标签（英文）', '项目标签') if name in df.columns),
        None,
    )
    if tag_column is None:
        raise KeyError("CSV文件中未找到'项目标签（英文）'或'项目标签'列")

    # Collect all unique tags, skipping empty cells and the review placeholder
    all_tags = set()
    for tags_str in df[tag_column]:
        if pd.notna(tags_str) and tags_str != '' and tags_str != '需要额外确认':
            all_tags.update(tag.strip() for tag in str(tags_str).split(','))

    # Tags that are absent from the translation dictionary
    missing_tags = sorted(
        tag for tag in all_tags if tag != '' and tag not in translation_dict
    )

    if missing_tags:
        print("以下标签不存在于翻译字典中，请添加中英文翻译，并返回一个英文：中文的字典：")
        for tag in missing_tags:
            print(f"  - {tag}")

# Example usage: list the tags that still lack a translation.
check_missing_tags(csv_file, json_file)

In [5]:
import pandas as pd
import json
import os

def process_project_tags_bilingual(csv_file_path, tags_json_path, output_csv_path):
    """
    Add a Chinese tag column next to the English project tags and save the CSV.

    The English tag column is taken from '项目标签（英文）' when it already
    exists, otherwise '项目标签' is renamed to it. This makes the function safe
    to run repeatedly on a file it has already rewritten, which happens
    whenever the input and output paths are the same.

    Args:
        csv_file_path: Path of the input CSV file.
        tags_json_path: Path of the English -> Chinese translation dictionary
            JSON file (an array whose first element is the mapping).
        output_csv_path: Path of the output CSV file.

    Returns:
        DataFrame: The processed data, or None when loading, validating or
        saving fails (the reason is printed).
    """
    english_col = '项目标签（英文）'
    chinese_col = '项目标签（中文）'
    needs_review = '需要额外确认'

    # 1. Load the translation dictionary
    try:
        with open(tags_json_path, 'r', encoding='utf-8') as f:
            translation_dict = json.load(f)[0]  # the JSON file is an array wrapping one dict
        print(f"成功加载翻译字典，包含 {len(translation_dict)} 个翻译条目")
    except Exception as e:
        print(f"读取翻译字典失败: {e}")
        return None

    # 2. Load the CSV file
    try:
        df = pd.read_csv(csv_file_path, encoding='utf-8-sig')
        print(f"成功读取CSV文件，包含 {len(df)} 行数据")
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return None

    # 3./4. Locate the English tag column, renaming the raw column if needed
    if english_col in df.columns:
        # File was already processed once; reuse the column instead of failing.
        print("检测到'项目标签（英文）'列已存在，跳过重命名")
    elif '项目标签' in df.columns:
        df = df.rename(columns={'项目标签': english_col})
        print("已将'项目标签'列重命名为'项目标签（英文）'")
    else:
        print("错误：CSV文件中未找到'项目标签'列")
        return None

    # 5. Translation helper
    def translate_tags(english_tags):
        """
        Convert a comma-separated English tag string into its Chinese counterpart.

        Args:
            english_tags: English tag string, comma-separated.

        Returns:
            str: Chinese tag string, comma-separated. Tags without a
            translation are kept in English; empty cells and the review
            placeholder are returned unchanged.
        """
        if pd.isna(english_tags) or english_tags == '' or english_tags == needs_review:
            return english_tags  # keep the original value

        translated_tags = []
        untranslated_tags = []
        for tag in (part.strip() for part in str(english_tags).split(',')):
            if tag in translation_dict:
                translated_tags.append(translation_dict[tag])
                continue
            # No translation found: keep the English text
            translated_tags.append(tag)
            if tag != needs_review and tag != '':
                untranslated_tags.append(tag)

        # Report tags that could not be translated
        if untranslated_tags:
            print(f"未找到翻译的标签: {untranslated_tags}")

        return ', '.join(translated_tags)

    # 6. Build the Chinese tag column
    print("正在翻译项目标签...")
    df[chinese_col] = df[english_col].apply(translate_tags)

    # 7. Place the Chinese column directly after the English one
    ordered_columns = [col for col in df.columns if col != chinese_col]
    ordered_columns.insert(ordered_columns.index(english_col) + 1, chinese_col)
    df = df[ordered_columns]

    # 8. Create the output directory if it does not exist
    output_dir = os.path.dirname(output_csv_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"创建输出目录: {output_dir}")

    # 9. Save the processed CSV file
    try:
        df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
        print(f"成功保存双语标签CSV文件到: {output_csv_path}")
    except Exception as e:
        print(f"保存CSV文件失败: {e}")
        return None

    # 10. Print statistics
    print("\n=== 处理统计 ===")
    total_records = len(df)

    def count_filled(column):
        """Number of rows whose cell is neither empty nor the review placeholder."""
        values = df[column]
        return int((values.notna() & (values != '') & (values != needs_review)).sum())

    def ratio(count):
        """Fraction of all rows; 0.0 for an empty table instead of dividing by zero."""
        return count / total_records if total_records else 0.0

    english_filled = count_filled(english_col)
    chinese_filled = count_filled(chinese_col)

    print(f"总记录数: {total_records}")
    print(f"有英文标签的记录: {english_filled} ({ratio(english_filled):.1%})")
    print(f"有中文标签的记录: {chinese_filled} ({ratio(chinese_filled):.1%})")

    # 11. Show the first records as a sample
    print("\n=== 前5条记录示例 ===")
    display_columns = ['大学英文名称', '专业英文名称', english_col, chinese_col]
    available_columns = [col for col in display_columns if col in df.columns]
    print(df[available_columns].head().to_string(index=False))

    # 12. List the records that still need manual confirmation
    need_confirmation = df[df[english_col] == needs_review]
    if len(need_confirmation) > 0:
        print(f"\n=== 需要额外确认的记录 ({len(need_confirmation)} 条) ===")
        for _, row in need_confirmation.iterrows():
            # .get: the name columns are optional for display, as in step 11
            print(f"- {row.get('大学英文名称', '')}: {row.get('专业英文名称', '')}")

    return df

# Example usage
if __name__ == "__main__":
    # Run the bilingual conversion on the configured files
    result_df = process_project_tags_bilingual(csv_file, json_file, output_file)

    # None signals that loading, validating or saving failed
    if result_df is None:
        print("\n❌ 处理失败，请检查错误信息")
    else:
        print("\n✅ 项目标签双语化处理完成！")

成功加载翻译字典，包含 323 个翻译条目
成功读取CSV文件，包含 121 行数据
已将'项目标签'列重命名为'项目标签（英文）'
正在翻译项目标签...
成功保存双语标签CSV文件到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/AppliedDataScience/fields_records/项目标签/项目标签_gemini-2.5-flash.csv

=== 处理统计 ===
总记录数: 121
有英文标签的记录: 116 (95.9%)
有中文标签的记录: 116 (95.9%)

=== 前5条记录示例 ===
                               大学英文名称                                                                                专业英文名称                                                                                             项目标签（英文）                    项目标签（中文）
Massachusetts Institute of Technology Civil and Environmental Engineering：Data Science for Engineering Systems (DSES) track Civil Engineering, Environmental Engineering, Interdisciplinary, Data Science, Computational Science 土木工程, 环境工程, 跨学科, 数据科学, 计算科学
Massachusetts Institute of Technology                                                        Social and Engineering Systems                            Interdisciplinary, Data Sci