In [2]:
import asyncio
import json
import os
import sys

import pandas as pd
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

# Single source of truth for the project directory: it is put on sys.path for
# the local import below and also serves as the root of the per-field CSVs.
# NOTE(review): this is a machine-specific absolute path; consider reading it
# from an environment variable so the notebook runs on other machines.
PROJECT_DIR = '/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience'

# Guarded so that re-running this cell does not keep appending duplicates.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

# Local module; must be imported after sys.path has been extended
# (presumably it lives in PROJECT_DIR -- verify).
from call_api import call_gemini, async_call_gemini

# Field (one CSV per field) processed by this notebook.
field_name = "网申地址"
field_path = f"{PROJECT_DIR}/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 7)

In [3]:
# Prompt sent to the model once per vote for every program row.
# It contains six `str.format` placeholders -- {university}, {degree},
# {program}, {department}, {admissions_url}, {program_url} -- so any literal
# curly brace added to this text must be doubled ({{ or }}).
# The model is instructed to answer with only the portal URL or the literal
# string "Not found".
prompt_template = """
You are an assistant that checks the application portal for the following graduate program.

You should use Google search to find the application portal. Note that you are looking for the application portal for the program where applicants submit their applications, not the program overview website.
In most cases, the application portal is just a login page where the graduate school manages all graduate programs application. It contains a login button and a create account button.

Answer instructions:
If you find the website, just return the website url and nothing else. If not found, return "Not found".

Example response:
"Not found"
"https://apply.gsas.harvard.edu/portal/apply_now"
"https://apply-psd.uchicago.edu/apply/"
"https://applygp.duke.edu/apply/?sr=ff282888-94bf-4e9e-af2b-868e6f1c72a1"

Use Google to search **"{university} {degree} {program} {department} application portal"** for more information.

URLs you should check:
• Admissions URL: {admissions_url}  
• Program URL: {program_url}

Here are your response. Note that either return the website url or "Not found", with nothing else:
"""

In [6]:
# Caps how many rows are processed at the same time. Every row holding the
# semaphore launches `num_vote` API calls at once, so up to 3 * num_vote
# requests can be in flight.
semaphore = asyncio.Semaphore(3)


def _summarize_response(response):
    """Flatten one model response into the JSON-serialisable dict stored per vote.

    Every field is read best-effort: if an attribute is missing or the
    response is malformed, the placeholder ('' for the text, "Not used" for
    the metadata fields) is stored instead of raising.

    NOTE(review): attribute names assume the object returned by
    `async_call_gemini` follows the google-genai response layout -- confirm.
    """
    def _read(getter, fallback):
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and task cancellation are never swallowed here.
        try:
            return getter()
        except Exception:
            return fallback

    def _supports():
        metadata = response.candidates[0].grounding_metadata
        try:
            # snake_case, consistent with the sibling attributes read below.
            supports = metadata.grounding_supports
        except AttributeError:
            # Fall back to the camelCase spelling used previously.
            supports = metadata.groundingSupports
        # The label used to be a copy-pasted "Search Query:".
        return f"Search Support: {supports}"

    return {
        "response_text": _read(
            lambda: response.candidates[0].content.parts[0].text, ''
        ),
        "url_context": _read(
            lambda: str(response.candidates[0].url_context_metadata), "Not used"
        ),
        "search_queries": _read(
            lambda: f"Search Query: {response.candidates[0].grounding_metadata.web_search_queries}",
            "Not used",
        ),
        "search_pages": _read(
            lambda: f"Search Chunks: {response.candidates[0].grounding_metadata.grounding_chunks}",
            "Not used",
        ),
        "search_support": _read(_supports, "Not used"),
    }


async def process_row(row, prompt_template, num_vote, model_name):
    """Query the model `num_vote` times for one program row and collect the answers.

    Args:
        row: One row of the field table; must provide ``to_dict()`` and the
            columns "大学英文名称", "学位", "专业英文名称", "所属院系",
            "招生网址" and "专业网址".
        prompt_template: ``str.format`` template with the placeholders
            university, degree, program, department, admissions_url and
            program_url.
        num_vote: Number of independent model calls issued for this row.
        model_name: Forwarded to ``async_call_gemini``.

    Returns:
        dict: A copy of the row plus the key "llm_reponses" (spelling kept for
        compatibility with files already written), mapping "response 1" ..
        "response N" to the per-vote summary. A vote whose API call raised
        gets the placeholder values and an extra "error" entry holding
        ``repr(exception)``.

    Raises:
        KeyError: If a required column is missing from `row`.
    """
    async with semaphore:
        row = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )
        record = row.copy()
        record["llm_reponses"] = {}

        # Launch all votes for this row concurrently. return_exceptions=True
        # keeps a single failed call from aborting the whole batch (and losing
        # every result gathered so far); the failure is recorded per vote.
        tasks = [
            async_call_gemini(prompt, model_name=model_name, use_search=True, url_context=True)
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks, return_exceptions=True)

        for i, response in enumerate(responses):
            summary = _summarize_response(response)
            if isinstance(response, BaseException):
                summary["error"] = repr(response)
            record["llm_reponses"][f"response {i+1}"] = summary
        return record

async def request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=0, end_at=-1):
    """Run `process_row` over a slice of `field_df` and save the results as JSON.

    Args:
        prompt_template: Prompt template forwarded to `process_row`.
        field_df: DataFrame with one program per row.
        num_vote: Number of model calls per row, forwarded to `process_row`.
        model_name: Model identifier; forwarded to `process_row` and used in
            the output file name.
        start_from: Positional index of the first row to process.
        end_at: Positional index one past the last row to process. ``-1``
            (the default) or ``None`` means "through the last row". Used
            literally as a slice bound, -1 would silently drop the final row.

    Returns:
        list[dict]: One record per processed row, in row order.

    Side effects:
        Writes the records to
        ``../fields_records/{field_name}/{field_name}_{model_name}_{start_from}_{end_at}.json``
        where `field_name` is the notebook-level global read at call time.
        The directory is created if it does not exist.
    """
    stop = None if end_at is None or end_at == -1 else end_at
    df = field_df.iloc[start_from:stop].copy()

    # One coroutine per row; concurrency is limited inside process_row.
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]
    response_records = await tqdm_asyncio.gather(*tasks)

    # The file name keeps the caller's original `end_at` value so existing
    # names such as "..._0_-1.json" stay stable.
    out_path = f"../fields_records/{field_name}/{field_name}_{model_name}_{start_from}_{end_at}.json"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False writes non-ASCII text verbatim, which
    # fails on platforms whose default encoding cannot represent it.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)
    return response_records

In [7]:
import nest_asyncio

# Only needed in Jupyter: lets asyncio.run() be called from inside a cell.
nest_asyncio.apply()

# Batch settings: model, votes per row, and the slice of field_df to process.
model_name = "gemini-2.5-flash"
num_vote = 3
start_from, end_at = 0, 10

response_records = asyncio.run(
    request_and_store_async(
        prompt_template,
        field_df,
        num_vote,
        model_name,
        start_from=start_from,
        end_at=end_at,
    )
)

100%|██████████| 10/10 [01:21<00:00,  8.16s/it]


In [None]:
def _response_text(response):
    """Return the answer text of one stored response entry.

    Entries may be dicts with a "response_text" key or plain strings; both
    are supported.
    """
    if isinstance(response, dict):
        return response.get("response_text") or ""
    return "" if response is None else str(response)


def majority_vote(records, field="同申互斥", default="no explicit constraints"):
    """Collapse each record's model responses into one answer stored under `field`.

    Despite the name this is not a strict majority: the first response whose
    text does not contain `default` wins, and `default` is used only when
    every response contains it (or there are no responses).

    The check is made against each response's text. Testing the response
    dict itself would only look at its keys, so every response would pass
    and the whole dict would be stored as the answer.

    Args:
        records: List of dicts, each with an "llm_reponses" mapping.
        field: Key under which the chosen answer is written.
        default: Text that marks an uninformative answer.

    Returns:
        The same `records` list, with each record modified in place.
    """
    for rec in records:
        answers = [_response_text(r) for r in rec["llm_reponses"].values()]
        informative = [text for text in answers if default not in text]
        rec[field] = informative[0] if informative else default
    return records
# Post-processing settings.
num_vote = 3
model_name = "gemini-2.5-flash"
# NOTE(review): this rebinds the notebook-level `field_name`, which
# request_and_store_async reads at call time; re-running the request cell
# after this one would write under the "同申互斥" directory.
field_name = "同申互斥"
accuracy = "median_accuracy"

# NOTE(review): request_and_store_async writes to
# ../fields_records/{field_name}/..., whereas this reads from
# ../fields_records/ directly -- confirm which layout is intended.
# Explicit UTF-8 on both files: the JSON holds non-ASCII text written with
# ensure_ascii=False.
with open(f"../fields_records/{field_name}_{model_name}_0_-1.json", "r", encoding="utf-8") as f:
    records = json.load(f)
records = majority_vote(records)
with open(f"../fields_records/{field_name}_{model_name}_{accuracy}.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)