In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "更多项目信息"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 9)

In [2]:
prompt_template = """
You are an assistant whose only task is to extract and concisely summarize the **key academic and career-oriented selling points** of the graduate program below.

────────────────────────────────────────────────────────
What to include  
• Ph.D. programs → main research areas, faculty strengths, lab resources, interdisciplinary links.  
• Academic Master’s → signature courses, research or thesis options, unique concentrations.  
• Professional Master’s → industry partnerships, internship / practicum opportunities, career services, ROI claims.  
Goal: deliver **3-5 crisp bullet points** that highlight the program’s distinct features or advantages.

────────────────────────────────────────────────────────
How to gather information  

1. **Primary *.edu* sources only**  
   • Follow the Program URLs provided.  
   • You may open any additional pages under the same university’s *.edu* domain (e.g., “About the Program”, “Research”, “Careers”, “Why Choose Us”).  
   ⛔  Ignore non-*.edu* sites, blogs, rankings, or promo videos without text.

2. **Optional Google search**  
   Single query:  
   "{university} {degree} {program} overview research career site:.edu"  
   Examine only *.edu* results.

3. **Summarize**  
   • Extract concrete selling points (avoid generic statements like “world-class faculty”).  
   • Each bullet ≤ 25 words, start with a strong noun phrase; skip long subjects like “The program offers…”.  
   • Provide at least **3** and at most **5** bullets.  
   • If no reliable *.edu* info is found, output **Not found**.  
   • Never invent.

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line per bullet; no extra text before or after)

Example of valid output:  
• Three research tracks: AI systems, probabilistic ML, and human-AI interaction  
• 12-month capstone + industry practicum with Fortune-500 partners  
• NSF-funded labs and dedicated GPU cluster for Ph.D. students  
• STEM-OPT eligible, 95 % job placement within six months  

If nothing can be confirmed:  
Not found


────────────────────────────────────────────────────────
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

Provide the program snapshot:
"""


In [3]:
import os
import json
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Async Gemini wrapper
from call_api import async_call_gemini

# ---------------------------------------------------------------------------
# Concurrency guard – avoid hitting rate-limits
# ---------------------------------------------------------------------------
semaphore = asyncio.Semaphore(4)            # max concurrent rows

# ---------------------------------------------------------------------------
# Per-row worker
# ---------------------------------------------------------------------------
async def process_row(row, prompt_template, num_vote: int, model_name: str):
    """
    1. Format the prompt for this row
    2. Launch `num_vote` Gemini calls in parallel
    3. Capture BOTH normal answers *and* every possible error case
    4. Return a serialisable record
    """
    async with semaphore:
        row    = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )

        record: dict = row.copy()
        record["llm_reponses"] = {}

        # -------- launch Gemini calls in parallel --------------------
        tasks = [
            async_call_gemini(
                prompt,
                model_name=model_name,
                use_search=True,
                url_context=True
            )
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        # -------- post-process each response -------------------------
        for i, response in enumerate(responses):
            resp_key = f"response {i+1}"

            # -- 1. Transport / server-side errors (string starting "Error:")
            if isinstance(response, str) and response.startswith("Error:"):
                record["llm_reponses"][resp_key] = {
                    "error": response                       # e.g. "Error: 429 Rate limit …"
                }
                continue

            # -- 2. Empty / malformed response objects
            if not hasattr(response, "candidates") or not response.candidates:
                record["llm_reponses"][resp_key] = {
                    "error": "No candidates returned",
                    "raw_response": str(response)
                }
                continue

            # -- 3. Extract main answer text
            try:
                text = response.candidates[0].content.parts[0].text
            except Exception as e:
                record["llm_reponses"][resp_key] = {
                    "error": f"Cannot parse text: {e}",
                    "raw_response": str(response)
                }
                continue

            # -- 4. Extract additional metadata (best-effort)
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except Exception:
                url_context = "Not used"

            try:
                search_pages = (
                    f"Search Chunks: "
                    f"{response.candidates[0].grounding_metadata.grounding_chunks}"
                )
            except Exception:
                search_pages = "Not used"

            try:
                search_queries = (
                    f"Search Query: "
                    f"{response.candidates[0].grounding_metadata.web_search_queries}"
                )
            except Exception:
                search_queries = "Not used"

            try:
                search_support = (
                    f"Search Supports: "
                    f"{response.candidates[0].grounding_metadata.groundingSupports}"
                )
            except Exception:
                search_support = "Not used"

            # -- 5. Store normal answer + metadata + raw object
            record["llm_reponses"][resp_key] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
                "raw_response": str(response)             # keep for deep-debugging
            }

        return record

# ---------------------------------------------------------------------------
# Batch orchestrator with tqdm progress bar
# ---------------------------------------------------------------------------
async def request_and_store_async(prompt_template,
                                  field_df,
                                  num_vote: int,
                                  model_name: str,
                                  start_from: int = 0,
                                  end_at: int = -1):
    """
    Runs `process_row` over the dataframe slice asynchronously,
    shows a live tqdm bar, and dumps the results to JSON.
    """
    df = field_df.copy()[start_from:end_at]

    # Spawn tasks for every row in the slice
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]

    # tqdm_asyncio.gather gives us progress updates as tasks complete
    response_records = await tqdm_asyncio.gather(*tasks)

    # Persist to disk ------------------------------------------------
    output_dir = f"../fields_records/{field_name}"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{field_name}_{model_name}_{start_from}_{end_at}.json"

    with open(output_path, "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)

    return response_records

In [4]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = -1
model_name = "gemini-2.5-flash"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 350/350 [23:58<00:00,  4.11s/it]


In [16]:
import json
import pandas as pd
import os

def extract_program_info_from_json(json_file_path):
    """
    从JSON文件中提取项目信息，选择response 1作为"更多项目信息"的值
    
    Args:
        json_file_path: JSON文件路径
    
    Returns:
        DataFrame: 处理后的数据
    """
    
    # 读取JSON文件
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    results = []
    
    for record in data:
        # 提取基本信息
        basic_info = {
            '大学英文名称': record.get('大学英文名称', '').strip(),
            '学位': record.get('学位', ''),
            '专业英文名称': record.get('专业英文名称', ''),
            '所属院系': record.get('所属院系', ''),
            '所属院系（英文）': record.get('所属院系（英文）', ''),
            '招生网址': record.get('招生网址', ''),
            '专业网址': record.get('专业网址', ''),
            '所属院系网址': record.get('所属院系网址', ''),
        }
        
        # 提取response 1的内容作为"更多项目信息"
        更多项目信息 = ''
        try:
            # 注意这里是 llm_reponses，不是 llm_responses
            if ('llm_reponses' in record and 
                'response 1' in record['llm_reponses'] and 
                'response_text' in record['llm_reponses']['response 1']):
                更多项目信息 = record['llm_reponses']['response 1']['response_text']
        except (KeyError, TypeError) as e:
            print(f"处理记录时出错: {record.get('大学英文名称', 'Unknown')} - {e}")
            更多项目信息 = ''
        # 将所有信息合并
        result = basic_info.copy()
        result['更多项目信息'] = 更多项目信息
        
        results.append(result)
    
    # 转换为DataFrame
    df = pd.DataFrame(results)
    
    # 生成输出文件路径（在同一文件夹中）
    output_dir = os.path.dirname(json_file_path)
    output_filename = '更多项目信息_提取结果.csv'
    output_path = os.path.join(output_dir, output_filename)
    
    # 保存为CSV文件
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    
    print(f"数据提取完成！")
    print(f"总共处理了 {len(results)} 条记录")
    print(f"结果已保存到: {output_path}")
    
    # 显示一些统计信息
    print(f"\n数据统计:")
    print(f"- 有更多项目信息的记录数: {df['更多项目信息'].notna().sum()}")
    print(f"- 缺失更多项目信息的记录数: {df['更多项目信息'].isna().sum()}")
    
    return df

json_file_path = "/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/更多项目信息/更多项目信息_gemini-2.5-flash_0_-1.json"

df = extract_program_info_from_json(json_file_path)

# 显示前几行数据预览
print("\n数据预览:")
print(df.head())

# 显示列信息
print(f"\nCSV文件包含以下列:")
for i, col in enumerate(df.columns, 1):
    print(f"{i}. {col}")

数据提取完成！
总共处理了 350 条记录
结果已保存到: /Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_records/更多项目信息/更多项目信息_提取结果.csv

数据统计:
- 有更多项目信息的记录数: 350
- 缺失更多项目信息的记录数: 0

数据预览:
                大学英文名称  学位                       专业英文名称  \
0   Harvard University  MS            Data Science SEAS   
1   Harvard University  MS                 Data Science   
2  Stanford University  MS                 Data Science   
3  Stanford University  MS                 Data Science   
4      Yale University  MS  Statistics and Data Science   

                       所属院系  \
0           约翰·保尔森工程与应用科学学院   
1  Harvard Extension School   
2                      工程学院   
3                    人文科学学院   
4                 艺术与科学研究生院   

                                            所属院系（英文）  \
0  Harvard John A. Paulson School of Engineering ...   
1                           Harvard Extension School   
2                              School of Engineering   
3                  School of Hum