In [2]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "项目标签"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 7)

In [6]:
prompt_template = """
Your task is to summarize the program characteristics of a graduate program from a university based on the content of the provided URL.
First, please carefully access and read the content of the following URL:
<URL>
{admissions_url}
{program_url}
</URL>
When summarizing the program characteristics, focus on the following aspects:
1. Disciplinary characteristics: Identify the main disciplines involved in the program, such as math, statistics, data science.
2. Interdisciplinary nature: Determine whether the program is an interdisciplinary program.
When extracting information from the URL content, be thorough and precise. One program can have multiple characteristics. For each program, you should return no more than 5 characteristics. 
If the program urls provides more than 5 keywords, you should return the most important 5 keywords.

Please output the identified characteristics as tags. For example, if the program involves math and statistics, is interdisciplinary, the output could be:

<Tag>math</Tag>
<Tag>statistics</Tag>
<Tag>interdisciplinary</Tag>

Start summarizing the program characteristics now. Remember to output the characteristics in the <Tag> tags and return nothing else.
"""

In [7]:
semaphore = asyncio.Semaphore(3) 

async def process_row(row, prompt_template, num_vote, model_name):
    async with semaphore:
        row = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )
        record = row.copy()
        record["llm_reponses"] = {}

        # Launch all API calls in parallel for this row
        tasks = [
            async_call_gemini(prompt, model_name=model_name, use_search=True, url_context=True)
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        for i, response in enumerate(responses):
            try:
                text = response.candidates[0].content.parts[0].text
            except:
                text = ''
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except:
                url_context = "Not used"
            try:
                search_pages = f"Search Chunks: {response.candidates[0].grounding_metadata.grounding_chunks}"
            except:
                search_pages = "Not used"
            try:
                search_queries = f"Search Query: {response.candidates[0].grounding_metadata.web_search_queries}"
            except:
                search_queries = "Not used"
            try:
                search_support = f"Search Query: {response.candidates[0].grounding_metadata.groundingSupports}"
            except:
                search_support = "Not used"

            record["llm_reponses"][f"response {i+1}"] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
            }
        return record

async def request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=0, end_at=-1):
    df = field_df.copy()[start_from:end_at]
    response_records = []

    # Create tasks for all rows
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]
    # Run all row tasks in parallel (limit concurrency if needed)
    response_records = await tqdm_asyncio.gather(*tasks)

    # Save results
    with open(f"../fields_records/{field_name}/{field_name}_{model_name}_{start_from}_{end_at}.json", "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)
    return response_records

In [9]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = -1
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 350/350 [28:09<00:00,  4.83s/it]


In [2]:
import json
import re
import pandas as pd

def extract_project_tags(json_file_path, output_csv_path):
    """
    Extract project tags from JSON file with fallback logic for invalid responses.
    
    Args:
        json_file_path (str): Path to the JSON file containing project tag data
        output_csv_path (str): Path to save the output CSV file
    
    Returns:
        pd.DataFrame: DataFrame with extracted results and statistics
    """
    
    def is_valid_response(response_text):
        """Check if a response is valid (contains proper <Tag> format)"""
        if not response_text or response_text.strip() == "":
            return False
        if "not found" in response_text.lower():
            return False
        if len(response_text) > 1000:  # Too long response
            return False
        
        # Check if contains proper <Tag> format
        tag_pattern = r'<[Tt]ag>([^<]+)</[Tt]ag>'
        tags = re.findall(tag_pattern, response_text)
        return len(tags) > 0
    
    def extract_tags_from_response(response_text):
        """Extract tags from a valid response"""
        tag_pattern = r'<[Tt]ag>([^<]+)</[Tt]ag>'
        tags = re.findall(tag_pattern, response_text)
        # Clean and format tags
        cleaned_tags = [tag.strip().title() for tag in tags if tag.strip()]
        return ", ".join(cleaned_tags)
    
    # Load JSON data
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    results = []
    stats = {
        'total_programs': 0,
        'first_response_valid': 0,
        'second_response_used': 0,
        'third_response_used': 0,
        'need_confirmation': 0
    }
    
    for program in data:
        stats['total_programs'] += 1
        
        # Extract basic info
        university = program.get('大学英文名称', '')
        degree = program.get('学位', '')
        major = program.get('专业英文名称', '')
        school = program.get('所属院系', '')
        
        # Extract responses
        llm_responses = program.get('llm_reponses', {})
        
        # Try responses in order: response 1, response 2, response 3
        final_tags = "需要额外确认"
        response_used = "none"
        
        for i in range(1, 4):
            response_key = f"response {i}"
            if response_key in llm_responses:
                response_text = llm_responses[response_key].get('response_text', '')
                
                if is_valid_response(response_text):
                    final_tags = extract_tags_from_response(response_text)
                    response_used = f"response_{i}"
                    
                    if i == 1:
                        stats['first_response_valid'] += 1
                    elif i == 2:
                        stats['second_response_used'] += 1
                    elif i == 3:
                        stats['third_response_used'] += 1
                    break
        
        if final_tags == "需要额外确认":
            stats['need_confirmation'] += 1
        
        # Store result
        results.append({
            '大学英文名称': university,
            '学位': degree,
            '专业英文名称': major,
            '所属院系': school,
            '项目标签': final_tags,
            'response_used': response_used
        })
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Save to CSV
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    
    # Print statistics
    print("项目标签提取统计:")
    print(f"总项目数: {stats['total_programs']}")
    print(f"第一次回答有效: {stats['first_response_valid']} ({stats['first_response_valid']/stats['total_programs']*100:.1f}%)")
    print(f"使用第二次回答: {stats['second_response_used']} ({stats['second_response_used']/stats['total_programs']*100:.1f}%)")
    print(f"使用第三次回答: {stats['third_response_used']} ({stats['third_response_used']/stats['total_programs']*100:.1f}%)")
    print(f"需要额外确认: {stats['need_confirmation']} ({stats['need_confirmation']/stats['total_programs']*100:.1f}%)")
    
    return df


df = extract_project_tags(
    '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/项目标签/项目标签_gemini-2.5-flash_0_-1.json',
    '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/项目标签/项目标签.csv'
)

项目标签提取统计:
总项目数: 350
第一次回答有效: 304 (86.9%)
使用第二次回答: 29 (8.3%)
使用第三次回答: 10 (2.9%)
需要额外确认: 7 (2.0%)


In [5]:
import pandas as pd
import re
from collections import Counter

def analyze_and_translate_project_tags(csv_file_path, output_csv_path):
    """
    Analyze project tags, create Chinese translations, and update CSV with bilingual tags.
    
    Args:
        csv_file_path (str): Path to the input CSV file
        output_csv_path (str): Path to save the updated CSV file
    
    Returns:
        dict: Statistics and translation mapping
    """
    
    # Read the CSV file
    df = pd.read_csv(csv_file_path, encoding='utf-8-sig')
    
    # Extract all unique tags
    all_tags = []
    for tags_str in df['项目标签']:
        if pd.notna(tags_str) and tags_str != "需要额外确认":
            # Split by comma and clean up
            tags = [tag.strip() for tag in tags_str.split(',')]
            all_tags.extend(tags)
    
    # Count frequency of each tag
    tag_counts = Counter(all_tags)
    unique_tags = sorted(tag_counts.keys())
    
    print(f"发现 {len(unique_tags)} 个不同的标签")
    print(f"标签总出现次数: {sum(tag_counts.values())}")
    
    # Create Chinese translation dictionary (updated with missing tags)
    english_to_chinese = {
        # Core Data Science & Analytics
        "Data Science": "数据科学",
        "Data Analytics": "数据分析",
        "Analytics": "分析学",
        "Applied Analytics": "应用分析学",
        "Business Analytics": "商业分析",
        "Advanced Analytics": "高级分析",
        "Data Analysis": "数据分析",
        "Big Data Analytics": "大数据分析",
        "Big Data Analysis": "大数据分析",
        "Big Data": "大数据",
        "Business Data Analytics": "商业数据分析",
        "Business Data Science": "商业数据科学",
        "Quantitative Analytics": "定量分析",
        "Quantitative Analysis": "定量分析",
        "Applied Data Science": "应用数据科学",
        "Statistical Data Science": "统计数据科学",
        "Data Sciences": "数据科学",
        "Data Science And Analytics": "数据科学与分析",
        "Analytics Techniques": "分析技术",
        
        # Mathematics & Statistics
        "Mathematics": "数学",
        "Statistics": "统计学",
        "Applied Mathematics": "应用数学",
        "Applied Statistics": "应用统计学",
        "Mathematical Statistics": "数理统计学",
        "Computational Mathematics": "计算数学",
        "Pure Mathematics": "纯数学",
        "Applied And Computational Mathematics": "应用与计算数学",
        "Financial Mathematics": "金融数学",
        "Biomathematics": "生物数学",
        "Mathematical Data Science": "数学数据科学",
        "Mathematical Sciences": "数学科学",
        "Computational Statistics": "计算统计学",
        "Applied Probability": "应用概率论",
        "Probability Theory": "概率论",
        "Mathematical Foundations": "数学基础",
        "Statistical Thinking": "统计思维",
        "Biostatistics": "生物统计学",
        "Mathematical Models": "数学模型",
        "Statistical Modeling": "统计建模",
        "Statistical Machine Learning": "统计机器学习",
        "Mathematical Modeling": "数学建模",
        "Quantitative Theory And Methods": "定量理论与方法",
        "Quantitative Methods": "定量方法",
        "Quantitative Techniques": "定量技术",
        "Quantitative": "定量",
        "Statistical Practice": "统计实践",
        "Mathematical & Computational Finance": "数学与计算金融",
        "Financial Engineering": "金融工程",
        "Actuarial Science": "精算科学",
        "Predictive Analytics": "预测分析",
        "Mathematics/Statistics": "数学/统计学",
        "Statistics/Mathematics": "统计学/数学",
        "Statistical": "统计的",
        "Statistical Science": "统计科学",
        "Statistical Inference": "统计推断",
        "Statistical Theory And Methodologies": "统计理论与方法",
        
        # Computer Science & Engineering
        "Computer Science": "计算机科学",
        "Computer Sciences": "计算机科学",
        "Computer Engineering": "计算机工程",
        "Electrical Engineering": "电气工程",
        "Electrical And Computer Engineering": "电气与计算机工程",
        "Signal And Information Processing": "信号与信息处理",
        "Software Engineering": "软件工程",
        "Software Development": "软件开发",
        "Software Design And Engineering": "软件设计与工程",
        "Computer Programming": "计算机编程",
        "Programming": "编程",
        "Computing": "计算",
        "Computational": "计算的",
        "Computational Science": "计算科学",
        "Computational Science And Engineering": "计算科学与工程",
        "Computational Engineering": "计算工程",
        "Computational Data Science": "计算数据科学",
        "Computational Analysis": "计算分析",
        "Computational Training": "计算训练",
        "Computational Biology": "计算生物学",
        "Computational & Applied Mathematics": "计算与应用数学",
        "Computational Modeling": "计算建模",
        "Scientific Computing": "科学计算",
        "Applied Computing": "应用计算",
        "Data-Centric Computing": "以数据为中心的计算",
        "Algorithmic": "算法的",
        "Algorithms": "算法",
        "Algorithms And Programming": "算法与编程",
        "Numerical Algorithms": "数值算法",
        "High-Performance Computing": "高性能计算",
        "Computer Information Systems": "计算机信息系统",
        "Computational Skills": "计算技能",
        "Python Programming": "Python编程",
        
        # Machine Learning & AI
        "Machine Learning": "机器学习",
        "Machine Learning/Ai": "机器学习/人工智能",
        "Machine Learning & Big Data": "机器学习与大数据",
        "Artificial Intelligence": "人工智能",
        "Ai/Machine Learning": "人工智能/机器学习",
        "Deep Learning": "深度学习",
        "Natural Language Processing": "自然语言处理",
        "Artificial Intelligence And Machine Learning": "人工智能与机器学习",
        "Ai And Analytics": "人工智能与分析",
        "Machine Learning And Ai Tools": "机器学习与人工智能工具",
        
        # Engineering Disciplines
        "Engineering": "工程学",
        "Industrial Engineering": "工业工程",
        "Industrial And Systems Engineering": "工业与系统工程",
        "Systems Engineering": "系统工程",
        "Civil Engineering": "土木工程",
        "Environmental Engineering": "环境工程",
        "Operations Engineering": "运营工程",
        "Operations Research": "运筹学",
        "Operations Management": "运营管理",
        "Engineering Management": "工程管理",
        "Technical Management": "技术管理",
        "Data Engineering": "数据工程",
        "Data Analytics Engineering": "数据分析工程",
        "Autonomous Systems": "自动化系统",
        "Stochastic Systems": "随机系统",
        "Applied Science": "应用科学",
        "Physical Sciences": "物理科学",
        "Natural Sciences": "自然科学",
        "Engineering Science": "工程科学",
        "Systems Science": "系统科学",
        
        # Information & Technology
        "Information Science": "信息科学",
        "Information Sciences": "信息科学",
        "Information Systems": "信息系统",
        "Information Systems Management": "信息系统管理",
        "Information Management": "信息管理",
        "Information Technology": "信息技术",
        "Information Technology Strategy": "信息技术战略",
        "Information Engineering": "信息工程",
        "Library Science": "图书馆学",
        "Library And Information Science": "图书馆与信息科学",
        "Technology": "技术",
        "Technology Management": "技术管理",
        "Informatics": "信息学",
        "Health Informatics": "健康信息学",
        "Bioinformatics": "生物信息学",
        
        # Data Management & Architecture
        "Data Management": "数据管理",
        "Data Systems": "数据系统",
        "Data Architecture": "数据架构",
        "Database Management": "数据库管理",
        "Big Data Engineering": "大数据工程",
        "Data Mining": "数据挖掘",
        "Data Visualization": "数据可视化",
        "Data Modeling": "数据建模",
        "Data Preparation": "数据准备",
        "Data Curation": "数据管理",
        "Data Intelligence": "数据智能",
        "Visualization": "可视化",
        
        # Business & Management
        "Business": "商业",
        "Business Intelligence": "商业智能",
        "Business Computing": "商业计算",
        "Business Applications": "商业应用",
        "Business Application": "商业应用",
        "Management": "管理学",
        "Management Science": "管理科学",
        "Decision Science": "决策科学",
        "Leadership": "领导力",
        "Teamwork": "团队合作",
        "Business Skills": "商业技能",
        "Applications": "应用",
        
        # Security & Privacy
        "Cybersecurity": "网络安全",
        "Information Security": "信息安全",
        "Cyber Security": "网络安全",
        "Security": "安全",
        "Security Studies": "安全研究",
        "Cyber Defense": "网络防御",
        
        # Human-Computer Interaction
        "Human-Computer Interaction": "人机交互",
        "Human Computer Interaction": "人机交互",
        "Human-Centered Computing": "以人为中心的计算",
        "User Experience (Ux)": "用户体验",
        "Social And Behavioral Aspects": "社会与行为方面",
        "Social Aspects Of Computing": "计算的社会方面",
        "Social Sciences": "社会科学",
        "Social Science Applications": "社会科学应用",
        "Sociotechnical Issues": "社会技术问题",
        
        # Specialized Domains
        "Genomics": "基因组学",
        "Biomedical Sciences": "生物医学科学",
        "Biological Science": "生物科学",
        "Complex Systems": "复杂系统",
        "Optimization": "优化",
        "Quantum Computing": "量子计算",
        "Cloud Computing": "云计算",
        "Cloud Services": "云服务",
        "Infrastructure": "基础设施",
        "Remote Sensing Data": "遥感数据",
        "Imaging Sciences": "成像科学",
        "Computer Graphics": "计算机图形学",
        "Digital Art": "数字艺术",
        "Computational Finance": "计算金融",
        "Biology": "生物学",
        "Chemistry": "化学",
        "Physics": "物理学",
        "Public Health": "公共卫生",
        "Science": "科学",
        "Computational & Data Science": "计算与数据科学",
        
        # Research & Academic
        "Research": "研究",
        "Applied": "应用的",
        "Technical": "技术的",
        "Technical Training": "技术培训",
        "Theoretical Computing": "理论计算",
        "Advanced Computing": "高级计算",
        "Stem": "STEM",
        "Biotechnology": "生物技术",
        "Experiential Learning": "体验式学习",
        "Creative Education": "创意教育",
        "Analytical Skills": "分析技能",
        "Problem-Solving": "问题解决",
        "Practical Application": "实践应用",
        "Applied Research": "应用研究",
        
        # Academic Degrees & Programs (for completeness)
        "Master'S In Computer And Information Science": "计算机与信息科学硕士",
        "Master'S In Bioinformatics": "生物信息学硕士",
        "Master'S In Data Science": "数据科学硕士",
        "Ph.D. In Computer Science": "计算机科学博士",
        "Data Science Certificate": "数据科学证书",
        
        # Interdisciplinary
        "Interdisciplinary": "跨学科",
        
        # Psychology & Economics
        "Psychology": "心理学",
        "Economics": "经济学",
        "Computational Mathematical Sciences": "计算数学科学",
        "International Relations": "国际关系",
        "Political Science": "政治学",
        "Evaluation Science": "评估科学",
        "Qualitative Data Analysis": "定性数据分析",
        "Quantitative Data Analysis": "定量数据分析",
        
        # Specialized Engineering & Science
        "Bayesian Methods": "贝叶斯方法",
        "Software And Networked Systems": "软件与网络系统",
        "Computing Technologies": "计算技术",
        "Computational Technologies": "计算技术",
        "Societal Impact": "社会影响",
        "Design": "设计",
        "Advanced Methods & Data Analysis": "高级方法与数据分析",
        "Computation": "计算",
        "Ethics": "伦理学",
        "Governance": "治理",
        "Laws": "法律",
        "Business Continuity": "业务连续性",
        "Facilities Management": "设施管理",
        "It": "信息技术",
        "Datacenter Systems Engineering": "数据中心系统工程",
        "Multidisciplinary": "多学科"
    }
    
    # Print statistics about tag categories
    print("\n标签分类统计:")
    categories = {
        "数据科学与分析": ["Data Science", "Data Analytics", "Analytics", "Applied Analytics", "Business Analytics"],
        "数学与统计": ["Mathematics", "Statistics", "Applied Mathematics", "Applied Statistics"],
        "计算机科学": ["Computer Science", "Computer Engineering", "Software Engineering"],
        "机器学习与AI": ["Machine Learning", "Artificial Intelligence", "Deep Learning"],
        "工程学": ["Engineering", "Industrial Engineering", "Systems Engineering"],
        "跨学科": ["Interdisciplinary"]
    }
    
    for category, tags in categories.items():
        count = sum(tag_counts.get(tag, 0) for tag in tags)
        print(f"{category}: {count} 次")
    
    # Create new columns for bilingual tags
    def translate_tags(tags_str):
        if pd.isna(tags_str) or tags_str == "需要额外确认":
            return tags_str, tags_str
        
        english_tags = [tag.strip() for tag in tags_str.split(',')]
        chinese_tags = []
        
        for tag in english_tags:
            if tag in english_to_chinese:
                chinese_tags.append(english_to_chinese[tag])
            else:
                # Handle any missing translations
                print(f"警告: 未找到标签 '{tag}' 的中文翻译")
                chinese_tags.append(tag)  # Keep original if no translation
        
        return tags_str, ", ".join(chinese_tags)
    
    # Apply translation
    translations = df['项目标签'].apply(translate_tags)
    df['项目标签（英文）'] = [t[0] for t in translations]
    df['项目标签（中文）'] = [t[1] for t in translations]
    
    # Remove the original column and reorder
    df = df.drop('项目标签', axis=1)
    
    # Reorder columns
    columns_order = ['大学英文名称', '学位', '专业英文名称', '所属院系', '项目标签（英文）', '项目标签（中文）', 'response_used']
    df = df[columns_order]
    
    # Save updated CSV
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    
    # Print summary
    print(f"\n处理完成!")
    print(f"总项目数: {len(df)}")
    print(f"需要额外确认的项目: {sum(df['项目标签（英文）'] == '需要额外确认')}")
    print(f"成功翻译的项目: {sum(df['项目标签（英文）'] != '需要额外确认')}")
    print(f"翻译字典包含 {len(english_to_chinese)} 个标签")
    
    return {
        'unique_tags': unique_tags,
        'tag_counts': tag_counts,
        'translation_dict': english_to_chinese,
        'total_programs': len(df)
    }

# Usage
result = analyze_and_translate_project_tags(
    '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/项目标签/项目标签.csv',
    '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/项目标签/项目标签.csv'
)

# Print the most common tags
print("\n最常见的标签 (前20个):")
for tag, count in result['tag_counts'].most_common(20):
    chinese_translation = result['translation_dict'].get(tag, tag)
    print(f"{tag} ({chinese_translation}): {count} 次")

发现 222 个不同的标签
标签总出现次数: 1769

标签分类统计:
数据科学与分析: 351 次
数学与统计: 328 次
计算机科学: 129 次
机器学习与AI: 128 次
工程学: 35 次
跨学科: 337 次

处理完成!
总项目数: 350
需要额外确认的项目: 7
成功翻译的项目: 343
翻译字典包含 237 个标签

最常见的标签 (前20个):
Interdisciplinary (跨学科): 337 次
Data Science (数据科学): 258 次
Statistics (统计学): 200 次
Computer Science (计算机科学): 109 次
Mathematics (数学): 94 次
Machine Learning (机器学习): 86 次
Data Analytics (数据分析): 50 次
Artificial Intelligence (人工智能): 41 次
Analytics (分析学): 35 次
Applied Mathematics (应用数学): 22 次
Operations Research (运筹学): 17 次
Information Science (信息科学): 16 次
Industrial Engineering (工业工程): 15 次
Engineering (工程学): 14 次
Electrical Engineering (电气工程): 13 次
Data Engineering (数据工程): 13 次
Applied Statistics (应用统计学): 12 次
Computer Engineering (计算机工程): 11 次
Programming (编程): 11 次
Cybersecurity (网络安全): 11 次
