In [1]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm
sys.path.append('/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience')
import asyncio
from call_api import call_gemini, async_call_gemini
from tqdm.asyncio import tqdm_asyncio

field_name = "同申互斥"
field_path = f"/Users/yijingyang/Library/CloudStorage/OneDrive-个人/GradPilot/ProgramDB/DataScience/fields_csv/{field_name}.csv"
field_df = pd.read_csv(field_path)

field_df.shape

(351, 7)

In [2]:
prompt_template = """
You are an assistant whose only task is to verify whether the graduate program below **restricts applicants from applying to additional programs in the same admissions cycle**.

Typical patterns (for context, not assumption)
• Many schools permit only **one** application within the *same department / division / school* (e.g., Engineering).  
• Some universities cap the **total** number of graduate applications campus-wide (e.g., “maximum two programs per year”).  
• Cross-school combinations (e.g., Engineering + Public Health) are often allowed unless the university sets a global limit.

────────────────────────────────────────────────────────
How to confirm the rule

1. **Primary *.edu* sources only**  
   • Review the Admissions and Program URLs supplied.  
   • You may open any additional pages under the same university’s *.edu* domain (e.g., “Application Instructions”, “FAQ”, “Graduate Admissions Policies”).  
   ⛔  Ignore non-*.edu* sites, rankings, blogs, or forums.

2. **Optional Google search**  
   Query once:  
   "{university} {degree} {program} multiple applications same school department limit site:.edu"  
   Check *.edu* results only until you find an authoritative statement.

3. **Decision rules**  
   • If a page clearly states a **limit** (e.g., “Applicants may apply to only one program per school” or “One application per academic year for MS or PhD in this department”) → quote that restriction in a **single concise sentence**.  
   • If the university sets a campus-wide cap (e.g., “You may submit up to two graduate applications total”) → state that rule.  
   • If no explicit constraint is found on an *.edu* page, output **No explicit constraints**.  
   • Never infer or invent.

────────────────────────────────────────────────────────
⚠️  Output format (exactly one line, no quotes, no extra text)  

Examples of valid outputs:  
Only one application per department is allowed each year  
Applicants may submit a maximum of two graduate applications university-wide  
No explicit constraints  

────────────────────────────────────────────────────────
Pages to consult first:
• Admissions URL: {admissions_url}  
• Program URL:    {program_url}

Are there constraints on applying to multiple programs?
"""


In [3]:
semaphore = asyncio.Semaphore(5) 

async def process_row(row, prompt_template, num_vote, model_name):
    async with semaphore:
        row = row.to_dict()
        prompt = prompt_template.format(
            university     = row["大学英文名称"],
            degree         = row["学位"],
            program        = row["专业英文名称"],
            department     = row["所属院系"],
            admissions_url = row["招生网址"],
            program_url    = row["专业网址"],
        )
        record = row.copy()
        record["llm_reponses"] = {}

        # Launch all API calls in parallel for this row
        tasks = [
            async_call_gemini(prompt, model_name=model_name, use_search=True, url_context=True)
            for _ in range(num_vote)
        ]
        responses = await asyncio.gather(*tasks)

        for i, response in enumerate(responses):
            try:
                text = response.candidates[0].content.parts[0].text
            except:
                text = ''
            try:
                url_context = str(response.candidates[0].url_context_metadata)
            except:
                url_context = "Not used"
            try:
                search_pages = f"Search Chunks: {response.candidates[0].grounding_metadata.grounding_chunks}"
            except:
                search_pages = "Not used"
            try:
                search_queries = f"Search Query: {response.candidates[0].grounding_metadata.web_search_queries}"
            except:
                search_queries = "Not used"
            try:
                search_support = f"Search Query: {response.candidates[0].grounding_metadata.groundingSupports}"
            except:
                search_support = "Not used"

            record["llm_reponses"][f"response {i+1}"] = {
                "response_text": text,
                "url_context": url_context,
                "search_queries": search_queries,
                "search_pages": search_pages,
                "search_support": search_support,
            }
        return record

async def request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=0, end_at=-1):
    df = field_df.copy()[start_from:end_at]
    response_records = []

    # Create tasks for all rows
    tasks = [
        process_row(row, prompt_template, num_vote, model_name)
        for _, row in df.iterrows()
    ]
    # Run all row tasks in parallel (limit concurrency if needed)
    response_records = await tqdm_asyncio.gather(*tasks)

    # Save results
    with open(f"../fields_records/{field_name}/{field_name}_{model_name}_{start_from}_{end_at}_newprompt.json", "w") as f:
        json.dump(response_records, f, ensure_ascii=False, indent=2)
    return response_records

In [4]:
import nest_asyncio
nest_asyncio.apply()  # Only needed in Jupyter

num_vote = 3
start_from = 0
end_at = -1
model_name = "gemini-2.5-pro"
response_records = asyncio.run(
    request_and_store_async(prompt_template, field_df, num_vote, model_name, start_from=start_from, end_at=end_at)
)

100%|██████████| 350/350 [51:52<00:00,  8.89s/it]    


In [None]:
def majority_vote(records):
    for rec in records:
        responses = rec["llm_reponses"]
        candidate_answers = [r for r in responses.values() if "no explicit constraints" not in r]
        if len(candidate_answers) > 0:
            rec["同申互斥"] = candidate_answers[0]
        else:
            rec["同申互斥"] = "no explicit constraints"
    return records
num_vote = 3
model_name = "gemini-2.5-flash"
field_name = "同申互斥"
accuracy = "median_accuracy"
with open(f"../fields_records/{field_name}_{model_name}_0_-1.json", "r") as f:
    records = json.load(f)
records = majority_vote(records)
with open(f"../fields_records/{field_name}_{model_name}_{accuracy}.json", "w") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)